diff --git a/cmd/api/api/instances.go b/cmd/api/api/instances.go index 5d329160..ee19d382 100644 --- a/cmd/api/api/instances.go +++ b/cmd/api/api/instances.go @@ -190,6 +190,14 @@ func (s *ApiService) CreateInstance(ctx context.Context, request oapi.CreateInst hvType = hypervisor.Type(*request.Body.Hypervisor) } + // Parse GPU configuration (vGPU mode) + var gpuConfig *instances.GPUConfig + if request.Body.Gpu != nil && request.Body.Gpu.Profile != nil && *request.Body.Gpu.Profile != "" { + gpuConfig = &instances.GPUConfig{ + Profile: *request.Body.Gpu.Profile, + } + } + // Calculate default resource limits when not specified (0 = auto) // Uses proportional allocation based on CPU: (vcpus / cpuCapacity) * resourceCapacity if diskIOBps == 0 { @@ -220,6 +228,7 @@ func (s *ApiService) CreateInstance(ctx context.Context, request oapi.CreateInst Devices: deviceRefs, Volumes: volumes, Hypervisor: hvType, + GPU: gpuConfig, } inst, err := s.InstanceManager.CreateInstance(ctx, domainReq) @@ -685,5 +694,13 @@ func instanceToOAPI(inst instances.Instance) oapi.Instance { oapiInst.Volumes = &oapiVolumes } + // Convert GPU info + if inst.GPUProfile != "" { + oapiInst.Gpu = &oapi.InstanceGPU{ + Profile: lo.ToPtr(inst.GPUProfile), + MdevUuid: lo.ToPtr(inst.GPUMdevUUID), + } + } + return oapiInst } diff --git a/cmd/api/api/resources.go b/cmd/api/api/resources.go index d0f58feb..ccfd2abb 100644 --- a/cmd/api/api/resources.go +++ b/cmd/api/api/resources.go @@ -56,6 +56,12 @@ func (s *ApiService) GetResources(ctx context.Context, _ oapi.GetResourcesReques }) } + // Add GPU status if available + if status.GPU != nil { + gpuStatus := convertGPUResourceStatus(status.GPU) + resp.Gpu = &gpuStatus + } + return oapi.GetResources200JSONResponse(resp), nil } @@ -75,3 +81,38 @@ func convertResourceStatus(rs resources.ResourceStatus) oapi.ResourceStatus { Source: source, } } + +func convertGPUResourceStatus(gs *resources.GPUResourceStatus) oapi.GPUResourceStatus { + result := oapi.GPUResourceStatus{ + Mode: oapi.GPUResourceStatusMode(gs.Mode), + TotalSlots: gs.TotalSlots, + UsedSlots: gs.UsedSlots, + } + + // Convert profiles (vGPU mode) + if len(gs.Profiles) > 0 { + profiles := make([]oapi.GPUProfile, len(gs.Profiles)) + for i, p := range gs.Profiles { + profiles[i] = oapi.GPUProfile{ + Name: p.Name, + FramebufferMb: p.FramebufferMB, + Available: p.Available, + } + } + result.Profiles = &profiles + } + + // Convert devices (passthrough mode) + if len(gs.Devices) > 0 { + devices := make([]oapi.PassthroughDevice, len(gs.Devices)) + for i, d := range gs.Devices { + devices[i] = oapi.PassthroughDevice{ + Name: d.Name, + Available: d.Available, + } + } + result.Devices = &devices + } + + return result +} diff --git a/cmd/api/main.go b/cmd/api/main.go index 18e66a64..54911b5c 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -22,6 +22,7 @@ import ( "github.com/onkernel/hypeman" "github.com/onkernel/hypeman/cmd/api/api" "github.com/onkernel/hypeman/cmd/api/config" + "github.com/onkernel/hypeman/lib/devices" "github.com/onkernel/hypeman/lib/guest" "github.com/onkernel/hypeman/lib/hypervisor/qemu" "github.com/onkernel/hypeman/lib/instances" @@ -200,6 +201,26 @@ func run() error { return fmt.Errorf("reconcile device state: %w", err) } + // Reconcile mdev devices (clears orphaned vGPUs from crashed VMs) + // Build mdev info from instances - only destroys mdevs tracked by hypeman + logger.Info("Reconciling mdev devices...") + var mdevInfos []devices.MdevReconcileInfo + if allInstances != nil { + for _, inst := range 
allInstances { + if inst.GPUMdevUUID != "" { + mdevInfos = append(mdevInfos, devices.MdevReconcileInfo{ + InstanceID: inst.Id, + MdevUUID: inst.GPUMdevUUID, + IsRunning: inst.State == instances.StateRunning || inst.State == instances.StateUnknown, + }) + } + } + } + if err := devices.ReconcileMdevs(app.Ctx, mdevInfos); err != nil { + // Log but don't fail - mdev cleanup is best-effort + logger.Warn("failed to reconcile mdev devices", "error", err) + } + // Initialize ingress manager (starts Caddy daemon and DNS server for dynamic upstreams) logger.Info("Initializing ingress manager...") if err := app.IngressManager.Initialize(app.Ctx); err != nil { diff --git a/integration/vgpu_test.go b/integration/vgpu_test.go new file mode 100644 index 00000000..73793707 --- /dev/null +++ b/integration/vgpu_test.go @@ -0,0 +1,244 @@ +package integration + +import ( + "bytes" + "context" + "os" + "testing" + "time" + + "github.com/onkernel/hypeman/cmd/api/config" + "github.com/onkernel/hypeman/lib/devices" + "github.com/onkernel/hypeman/lib/guest" + "github.com/onkernel/hypeman/lib/hypervisor" + "github.com/onkernel/hypeman/lib/images" + "github.com/onkernel/hypeman/lib/instances" + "github.com/onkernel/hypeman/lib/network" + "github.com/onkernel/hypeman/lib/paths" + "github.com/onkernel/hypeman/lib/system" + "github.com/onkernel/hypeman/lib/volumes" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestVGPU is an integration test that verifies vGPU (SR-IOV mdev) support works. +// +// This test automatically detects vGPU availability and skips if: +// - No SR-IOV VFs are found in /sys/class/mdev_bus/ +// - No vGPU profiles are available +// - Not running as root (required for mdev creation) +// - KVM is not available +// +// To run manually: +// +// sudo go test -v -run TestVGPU -timeout 5m ./integration/... +// +// Note: This test verifies mdev creation and PCI device visibility inside the VM. +// It does NOT test nvidia-smi or CUDA functionality since that requires NVIDIA +// guest drivers pre-installed in the image. 
+func TestVGPU(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + // Auto-detect vGPU availability - skip if prerequisites not met + skipReason, profile := checkVGPUTestPrerequisites() + if skipReason != "" { + t.Skip(skipReason) + } + + t.Logf("vGPU test prerequisites met, using profile: %s", profile) + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() + + // Set up test environment + tmpDir := t.TempDir() + p := paths.New(tmpDir) + + cfg := &config.Config{ + DataDir: tmpDir, + BridgeName: "vmbr0", + SubnetCIDR: "10.100.0.0/16", + DNSServer: "1.1.1.1", + } + + // Create managers + imageManager, err := images.NewManager(p, 1, nil) + require.NoError(t, err) + + systemManager := system.NewManager(p) + networkManager := network.NewManager(p, cfg, nil) + deviceManager := devices.NewManager(p) + volumeManager := volumes.NewManager(p, 0, nil) + + limits := instances.ResourceLimits{ + MaxOverlaySize: 100 * 1024 * 1024 * 1024, + MaxVcpusPerInstance: 0, + MaxMemoryPerInstance: 0, + MaxTotalVcpus: 0, + MaxTotalMemory: 0, + } + + instanceManager := instances.NewManager(p, imageManager, systemManager, networkManager, deviceManager, volumeManager, limits, "", nil, nil) + + // Track instance ID for cleanup + var instanceID string + + // Cleanup any orphaned instances and mdevs + t.Cleanup(func() { + if instanceID != "" { + t.Log("Cleanup: Deleting instance...") + instanceManager.DeleteInstance(ctx, instanceID) + } + }) + + // Step 1: Ensure system files (kernel, initrd) + t.Log("Step 1: Ensuring system files...") + err = systemManager.EnsureSystemFiles(ctx) + require.NoError(t, err) + t.Log("System files ready") + + // Step 2: Pull alpine image (lightweight for testing) + imageName := "docker.io/library/alpine:latest" + t.Log("Step 2: Pulling alpine image...") + _, err = imageManager.CreateImage(ctx, images.CreateImageRequest{ + Name: imageName, + }) + require.NoError(t, err) + + // Wait for image to be ready + t.Log("Waiting for image build...") + var img *images.Image + for i := 0; i < 120; i++ { + img, err = imageManager.GetImage(ctx, imageName) + if err == nil && img.Status == images.StatusReady { + break + } + if img != nil && img.Status == images.StatusFailed { + errMsg := "unknown" + if img.Error != nil { + errMsg = *img.Error + } + t.Fatalf("Image build failed: %s", errMsg) + } + time.Sleep(1 * time.Second) + } + require.NotNil(t, img, "Image should exist") + require.Equal(t, images.StatusReady, img.Status, "Image should be ready") + t.Log("Image ready") + + // Step 3: Create instance with vGPU using QEMU hypervisor + // QEMU is required for vGPU/mdev passthrough with NVIDIA's vGPU manager + t.Log("Step 3: Creating instance with vGPU profile:", profile) + inst, err := instanceManager.CreateInstance(ctx, instances.CreateInstanceRequest{ + Name: "vgpu-test", + Image: imageName, + Size: 2 * 1024 * 1024 * 1024, // 2GB + HotplugSize: 512 * 1024 * 1024, + OverlaySize: 1024 * 1024 * 1024, + Vcpus: 2, + NetworkEnabled: false, // No network needed for this test + Hypervisor: hypervisor.TypeQEMU, + GPU: &instances.GPUConfig{ + Profile: profile, + }, + }) + require.NoError(t, err) + instanceID = inst.Id + t.Logf("Instance created: %s", inst.Id) + + // Verify mdev UUID was assigned + require.NotEmpty(t, inst.GPUMdevUUID, "Instance should have mdev UUID assigned") + t.Logf("mdev UUID: %s", inst.GPUMdevUUID) + + // Step 4: Verify mdev was created in sysfs + t.Run("MdevCreated", func(t *testing.T) { + mdevPath := 
"/sys/bus/mdev/devices/" + inst.GPUMdevUUID + _, err := os.Stat(mdevPath) + assert.NoError(t, err, "mdev device should exist at %s", mdevPath) + t.Logf("mdev exists at: %s", mdevPath) + }) + + // Step 5: Wait for guest agent to be ready + t.Log("Step 5: Waiting for guest agent...") + err = waitForGuestAgent(ctx, instanceManager, inst.Id, 60*time.Second) + require.NoError(t, err, "guest agent should be ready") + + // Step 6: Verify GPU is visible inside VM via PCI + t.Run("GPUVisibleInVM", func(t *testing.T) { + actualInst, err := instanceManager.GetInstance(ctx, inst.Id) + require.NoError(t, err) + + dialer, err := hypervisor.NewVsockDialer(actualInst.HypervisorType, actualInst.VsockSocket, actualInst.VsockCID) + require.NoError(t, err) + + // Check for NVIDIA vendor ID (0x10de) in guest PCI devices + var stdout, stderr bytes.Buffer + checkGPUCmd := "cat /sys/bus/pci/devices/*/vendor 2>/dev/null | grep -i 10de && echo 'NVIDIA_FOUND' || echo 'NO_NVIDIA'" + + _, err = guest.ExecIntoInstance(ctx, dialer, guest.ExecOptions{ + Command: []string{"/bin/sh", "-c", checkGPUCmd}, + Stdout: &stdout, + Stderr: &stderr, + TTY: false, + }) + require.NoError(t, err, "exec should work") + + output := stdout.String() + t.Logf("GPU check output: %s", output) + + assert.Contains(t, output, "NVIDIA_FOUND", "NVIDIA GPU (vendor 0x10de) should be visible in guest") + }) + + // Step 7: Check instance GPU info is correct + t.Run("InstanceGPUInfo", func(t *testing.T) { + actualInst, err := instanceManager.GetInstance(ctx, inst.Id) + require.NoError(t, err) + + assert.Equal(t, profile, actualInst.GPUProfile, "GPU profile should match") + assert.NotEmpty(t, actualInst.GPUMdevUUID, "mdev UUID should be set") + t.Logf("Instance GPU: profile=%s, mdev=%s", actualInst.GPUProfile, actualInst.GPUMdevUUID) + }) + + t.Log("✅ vGPU test PASSED!") +} + +// checkVGPUTestPrerequisites checks if vGPU test can run. +// Returns (skipReason, profileName) - skipReason is empty if all prerequisites are met. +func checkVGPUTestPrerequisites() (string, string) { + // Check KVM + if _, err := os.Stat("/dev/kvm"); os.IsNotExist(err) { + return "vGPU test requires /dev/kvm", "" + } + + // Check for root (required for mdev creation via sysfs) + if os.Geteuid() != 0 { + return "vGPU test requires root (sudo) for mdev creation", "" + } + + // Check for vGPU mode (SR-IOV VFs present) + mode := devices.DetectHostGPUMode() + if mode != devices.GPUModeVGPU { + return "vGPU test requires SR-IOV VFs in /sys/class/mdev_bus/", "" + } + + // Check for available profiles + profiles, err := devices.ListGPUProfiles() + if err != nil { + return "vGPU test failed to list profiles: " + err.Error(), "" + } + if len(profiles) == 0 { + return "vGPU test requires at least one GPU profile", "" + } + + // Find a profile with available instances + for _, p := range profiles { + if p.Available > 0 { + return "", p.Name + } + } + + return "vGPU test requires at least one available VF (all VFs are in use)", "" +} + diff --git a/lib/devices/GPU.md b/lib/devices/GPU.md index 55c73673..73243637 100644 --- a/lib/devices/GPU.md +++ b/lib/devices/GPU.md @@ -1,161 +1,263 @@ -# GPU Passthrough Support +# GPU and vGPU Support -This document covers NVIDIA GPU passthrough specifics. For general device passthrough, see [README.md](README.md). +This document covers GPU passthrough and vGPU (SR-IOV) support in hypeman. 
-## How GPU Passthrough Works +## Overview -hypeman supports NVIDIA GPU passthrough via VFIO, with automatic driver injection: +hypeman supports two GPU modes, automatically detected based on host configuration: + +| Mode | Description | Use Case | +|------|-------------|----------| +| **vGPU (SR-IOV)** | Virtual GPUs via mdev on SR-IOV VFs | Multi-tenant, shared GPU resources | +| **Passthrough** | Whole GPU VFIO passthrough | Dedicated GPU per instance | + +The host's GPU mode is determined by the host driver configuration: +- If `/sys/class/mdev_bus/` contains VFs → vGPU mode +- If NVIDIA GPUs are available for VFIO → passthrough mode + +## vGPU Mode (Recommended) + +vGPU mode uses NVIDIA's SR-IOV technology to create Virtual Functions (VFs), each capable of hosting an mdev (mediated device) representing a vGPU. + +### How It Works ``` ┌─────────────────────────────────────────────────────────────────────┐ -│ hypeman Initrd (built at startup) │ +│ Physical GPU (e.g., NVIDIA L40S) │ │ ┌──────────────────────────────────────────────────────────────┐ │ -│ │ /lib/modules//kernel/drivers/gpu/ │ │ -│ │ ├── nvidia.ko │ │ -│ │ ├── nvidia-uvm.ko │ │ -│ │ ├── nvidia-modeset.ko │ │ -│ │ └── nvidia-drm.ko │ │ -│ ├──────────────────────────────────────────────────────────────┤ │ -│ │ /usr/lib/nvidia/ │ │ -│ │ ├── libcuda.so.570.86.16 │ │ -│ │ ├── libnvidia-ml.so.570.86.16 │ │ -│ │ ├── libnvidia-ptxjitcompiler.so.570.86.16 │ │ -│ │ └── ... (other driver libraries) │ │ +│ │ SR-IOV Virtual Functions (VFs) │ │ +│ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ │ +│ │ │ VF 0 │ │ VF 1 │ │ VF 2 │ │ VF 3 │ ... │ │ +│ │ │ mdev │ │ (avail) │ │ mdev │ │ (avail) │ │ │ +│ │ │ L40S-1Q │ │ │ │ L40S-2Q │ │ │ │ │ +│ │ └─────────┘ └─────────┘ └─────────┘ └─────────┘ │ │ │ └──────────────────────────────────────────────────────────────┘ │ -└─────────────────────────────────────────────────────────────────────┘ - │ - ▼ (at VM boot, if HAS_GPU=1) -┌─────────────────────────────────────────────────────────────────────┐ -│ Guest VM │ -│ 1. Load kernel modules (modprobe nvidia, etc.) │ -│ 2. Create device nodes (/dev/nvidia0, /dev/nvidiactl, etc.) │ -│ 3. Copy driver libs to container rootfs │ -│ 4. Run ldconfig to update library cache │ -│ 5. Container can now use GPU! │ └─────────────────────────────────────────────────────────────────────┘ ``` -## Container Image Requirements +### Available Profiles -With driver injection, containers **do not need** to bundle NVIDIA driver libraries. +Query available profiles via the resources API: -**Minimal CUDA image example:** +```bash +curl -s http://localhost:8080/resources | jq .gpu +``` -```dockerfile -FROM nvidia/cuda:12.4-runtime-ubuntu22.04 -# Your application - no driver installation needed! 
-RUN pip install torch -CMD ["python", "train.py"] +```json +{ + "mode": "vgpu", + "total_slots": 64, + "used_slots": 5, + "profiles": [ + {"name": "L40S-1Q", "framebuffer_mb": 1024, "available": 59}, + {"name": "L40S-2Q", "framebuffer_mb": 2048, "available": 30}, + {"name": "L40S-4Q", "framebuffer_mb": 4096, "available": 16} + ] +} ``` -hypeman injects the following at boot: +### Creating an Instance with vGPU -- `libcuda.so` - CUDA driver API -- `libnvidia-ml.so` - NVML (nvidia-smi, monitoring) -- `libnvidia-ptxjitcompiler.so` - PTX JIT compilation -- `libnvidia-nvvm.so` - NVVM compiler -- `libnvidia-gpucomp.so` - GPU compute library -- `nvidia-smi` binary -- `nvidia-modprobe` binary +Request a vGPU by specifying the profile name: -## Driver Version Compatibility +```bash +curl -X POST http://localhost:8080/instances \ + -H "Content-Type: application/json" \ + -d '{ + "name": "ml-training", + "image": "nvidia/cuda:12.4-runtime-ubuntu22.04", + "vcpus": 4, + "size": "8GB", + "gpu": { + "profile": "L40S-1Q" + } + }' +``` -The driver libraries injected by hypeman are pinned to a specific version that matches the kernel modules. This version is tracked in: +The response includes the assigned mdev UUID: + +```json +{ + "id": "abc123", + "name": "ml-training", + "gpu": { + "profile": "L40S-1Q", + "mdev_uuid": "aa618089-8b16-4d01-a136-25a0f3c73123" + } +} +``` -- **Kernel release:** `onkernel/linux` GitHub releases (e.g., `ch-6.12.8-kernel-2-20251211`) -- **hypeman config:** `lib/system/versions.go` - `NvidiaDriverVersion` map +### Ephemeral mdev Lifecycle -### Current Driver Version +mdev devices are **ephemeral**: created on instance start, destroyed on instance delete. -| Kernel Version | Driver Version | Release Date | -|---------------|----------------|--------------| -| ch-6.12.8-kernel-2-20251211 | 570.86.16 | 2025-12-11 | +``` +Instance Create → Create mdev → Attach to VM → Instance Running +Instance Delete → Stop VM → Destroy mdev → VF available again +``` -### CUDA Compatibility +This ensures: +- **Security**: No VRAM data leakage between instances +- **Clean state**: Fresh vGPU for each instance +- **Automatic cleanup**: Orphaned mdevs cleaned up on server restart -Driver 570.86.16 supports CUDA 12.4 and earlier. Check [NVIDIA's compatibility matrix](https://docs.nvidia.com/deploy/cuda-compatibility/) for details. +## Passthrough Mode -## Upgrading the Driver +Passthrough mode assigns entire physical GPUs to instances via VFIO. -To upgrade the NVIDIA driver version: +### Checking Available GPUs -1. **Choose a new version** from [NVIDIA's Linux drivers](https://www.nvidia.com/Download/index.aspx) +```bash +curl -s http://localhost:8080/resources | jq .gpu +``` -2. **Update onkernel/linux:** - - Edit `.github/workflows/release.yaml` - - Change `DRIVER_VERSION=` in all locations (search for the current version) - - The workflow file contains comments explaining what to update - - Create a new release tag (e.g., `ch-6.12.8-kernel-2-YYYYMMDD`) +```json +{ + "mode": "passthrough", + "total_slots": 4, + "used_slots": 2, + "devices": [ + {"name": "NVIDIA L40S", "available": true}, + {"name": "NVIDIA L40S", "available": false} + ] +} +``` -3. **Update hypeman:** - - Edit `lib/system/versions.go` - - Add new `KernelVersion` constant - - Update `DefaultKernelVersion` - - Update `NvidiaDriverVersion` map entry - - Update `NvidiaModuleURLs` with new release URL - - Update `NvidiaDriverLibURLs` with new release URL +### Using Passthrough -4. 
**Test thoroughly** before deploying: - - Run GPU passthrough E2E tests - - Verify with real CUDA workloads (e.g., ollama inference) +For whole-GPU passthrough, use the devices API (see [README.md](README.md)): -## Supported GPUs +```bash +# Register GPU +curl -X POST http://localhost:8080/devices \ + -d '{"pci_address": "0000:82:00.0", "name": "gpu-0"}' -All NVIDIA datacenter GPUs supported by the open-gpu-kernel-modules are supported: +# Create instance with GPU +curl -X POST http://localhost:8080/instances \ + -d '{"name": "ml-job", "image": "nvidia/cuda:12.4", "devices": ["gpu-0"]}' +``` -- NVIDIA H100, H200 -- NVIDIA L4, L40, L40S -- NVIDIA A100, A10, A30 -- NVIDIA T4 -- And other Turing/Ampere/Hopper/Ada Lovelace architecture GPUs +## Guest Driver Requirements -Consumer GPUs (GeForce) are **not** supported by the open kernel modules. +**Important**: hypeman does NOT inject NVIDIA drivers into guest VMs. The guest image must include pre-installed NVIDIA drivers. -## Troubleshooting +### Recommended Base Images -### nvidia-smi shows wrong driver version +Use NVIDIA's official CUDA images with driver utilities: -The driver version shown by nvidia-smi should match hypeman's configured version. If it differs, the container may have its own driver libraries that are taking precedence. Either: +```dockerfile +FROM nvidia/cuda:12.4.1-runtime-ubuntu22.04 -- Use a minimal CUDA runtime image without driver libs -- Or ensure the container's driver version matches +# Install NVIDIA driver userspace utilities +RUN apt-get update && \ + apt-get install -y nvidia-utils-550 && \ + rm -rf /var/lib/apt/lists/* -### CUDA initialization failed +# Your application +COPY app /app +CMD ["/app/run"] +``` -Check that: +### Driver Version Compatibility -1. Kernel modules are loaded: `cat /proc/modules | grep nvidia` -2. Device nodes exist: `ls -la /dev/nvidia*` -3. Libraries are in LD_LIBRARY_PATH: `ldconfig -p | grep nvidia` +The guest driver version must be compatible with: +- The host's NVIDIA vGPU Manager (for vGPU mode) +- The CUDA toolkit version your application requires -### Driver/library version mismatch +Check NVIDIA's [vGPU Documentation](https://docs.nvidia.com/grid/) for compatibility matrices. -Error like `NVML_ERROR_LIB_RM_VERSION_MISMATCH` means the userspace library version doesn't match the kernel module version. This shouldn't happen with hypeman's automatic injection, but can occur if the container has its own driver libraries. +## API Reference -**Solution:** Use a base image that doesn't include driver libraries, or ensure any bundled libraries match the hypeman driver version. +### GET /resources -### GPU not detected in container +Returns GPU status along with other resources: -1. Verify the GPU was attached to the instance: +```json +{ + "cpu": { ... }, + "memory": { ... }, + "gpu": { + "mode": "vgpu", + "total_slots": 64, + "used_slots": 5, + "profiles": [ + {"name": "L40S-1Q", "framebuffer_mb": 1024, "available": 59} + ] + } +} +``` + +### POST /instances (with GPU) + +```json +{ + "name": "my-instance", + "image": "nvidia/cuda:12.4-runtime", + "gpu": { + "profile": "L40S-1Q" + } +} +``` + +### Instance Response + +```json +{ + "id": "abc123", + "gpu": { + "profile": "L40S-1Q", + "mdev_uuid": "aa618089-8b16-4d01-a136-25a0f3c73123" + } +} +``` + +## Troubleshooting + +### No GPU shown in /resources + +1. Check host GPU mode detection: ```bash - hypeman instance get | jq .devices + ls /sys/class/mdev_bus/ # Should show VFs for vGPU mode ``` -2. 
Check the VM console log for module loading errors: +2. Verify NVIDIA drivers are loaded on host: ```bash - cat /var/lib/hypeman/instances//console.log | grep -i nvidia + nvidia-smi ``` -3. Verify VFIO binding on the host: +### Profile not available + +The requested profile may require more VRAM than available. Check: +```bash +curl -s http://localhost:8080/resources | jq '.gpu.profiles' +``` + +### nvidia-smi fails in guest + +1. Verify guest image has NVIDIA drivers installed +2. Check driver version compatibility with vGPU Manager +3. Inspect guest boot logs: ```bash - ls -la /sys/bus/pci/devices//driver + curl http://localhost:8080/instances//logs?source=app + ``` + +### mdev creation fails + +1. Check if VFs are available: + ```bash + ls /sys/class/mdev_bus/ + ``` + +2. Verify mdev types: + ```bash + cat /sys/class/mdev_bus/*/mdev_supported_types/*/available_instances ``` ## Performance Tuning ### Huge Pages -For best GPU performance, enable huge pages on the host: +For best vGPU performance, enable huge pages on the host: ```bash echo 1024 > /proc/sys/vm/nr_hugepages @@ -163,7 +265,7 @@ echo 1024 > /proc/sys/vm/nr_hugepages ### IOMMU Configuration -Ensure IOMMU is properly configured: +Ensure IOMMU is properly configured for either mode: ```bash # Intel @@ -173,5 +275,16 @@ intel_iommu=on iommu=pt amd_iommu=on iommu=pt ``` -The `iommu=pt` (passthrough) option improves performance for devices not using VFIO. +## Supported Hardware + +### vGPU Mode (SR-IOV) +- NVIDIA L40, L40S +- NVIDIA A100 (with appropriate vGPU license) +- Other NVIDIA GPUs supporting SR-IOV +### Passthrough Mode +All NVIDIA datacenter GPUs supported by open-gpu-kernel-modules: +- NVIDIA H100, H200 +- NVIDIA L4, L40, L40S +- NVIDIA A100, A10, A30 +- NVIDIA T4 diff --git a/lib/devices/README.md b/lib/devices/README.md index 0e34e662..3621b04d 100644 --- a/lib/devices/README.md +++ b/lib/devices/README.md @@ -1,86 +1,83 @@ # Device Passthrough -This package provides GPU and PCI device passthrough for virtual machines using the Linux VFIO (Virtual Function I/O) framework. +This package provides GPU, vGPU, and PCI device passthrough for virtual machines. ## Overview -Device passthrough allows a VM to have direct, near-native access to physical hardware (GPUs, network cards, etc.) by bypassing the host's device drivers and giving the guest exclusive control. For a deep dive into the VFIO framework, see the [kernel documentation](https://docs.kernel.org/driver-api/vfio.html). +hypeman supports two GPU modes: -``` -┌─────────────────────────────────────────────────────────────┐ -│ Host │ -│ ┌─────────────┐ ┌─────────────────────────────────┐ │ -│ │ hypeman │ │ VFIO Driver │ │ -│ │ (VMM) │────▶│ /dev/vfio/ │ │ -│ └─────────────┘ └─────────────────────────────────┘ │ -│ │ │ -│ ┌───────────────────────────┼──────────────────────────┐ │ -│ │ IOMMU (hardware) ▼ │ │ -│ │ - Translates guest physical → host physical │ │ -│ │ - Isolates DMA (device can only access VM memory) │ │ -│ └──────────────────────────────────────────────────────┘ │ -│ │ │ -│ ▼ │ -│ ┌──────────────┐ │ -│ │ GPU (PCIe) │ │ -│ └──────────────┘ │ -└─────────────────────────────────────────────────────────────┘ -``` +| Mode | Description | Use Case | +|------|-------------|----------| +| **vGPU (SR-IOV)** | Virtual GPUs via mdev on SR-IOV VFs | Multi-tenant, shared GPU resources | +| **Passthrough** | Whole GPU VFIO passthrough | Dedicated GPU per instance | + +For GPU-specific documentation, see [GPU.md](GPU.md). 
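+For programmatic callers of this package, the same mode split can be inspected
+directly; a minimal sketch combining mode detection with profile/device listing
+(it only uses the exported names of this package):
+
+```go
+package main
+
+import (
+	"fmt"
+
+	"github.com/onkernel/hypeman/lib/devices"
+)
+
+func main() {
+	switch devices.DetectHostGPUMode() {
+	case devices.GPUModeVGPU:
+		// SR-IOV VFs present: list vGPU profiles and their current availability.
+		profiles, err := devices.ListGPUProfiles()
+		if err != nil {
+			panic(err)
+		}
+		for _, p := range profiles {
+			fmt.Printf("%s (%d MB framebuffer): %d available\n", p.Name, p.FramebufferMB, p.Available)
+		}
+	case devices.GPUModePassthrough:
+		// Whole GPUs are available for VFIO passthrough.
+		gpus, err := devices.DiscoverAvailableDevices()
+		if err != nil {
+			panic(err)
+		}
+		fmt.Printf("%d device(s) available for passthrough\n", len(gpus))
+	default:
+		fmt.Println("no GPUs detected on this host")
+	}
+}
+```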
## Package Structure ``` lib/devices/ -├── types.go # Device, AvailableDevice, CreateDeviceRequest -├── errors.go # Error definitions -├── discovery.go # PCI device discovery from sysfs -├── vfio.go # VFIO bind/unbind operations -├── manager.go # Manager interface and implementation -├── manager_test.go # Unit tests -├── gpu_e2e_test.go # End-to-end GPU passthrough test (auto-skips if no GPU) +├── types.go # Device, AvailableDevice, GPUProfile, MdevDevice types +├── errors.go # Error definitions +├── discovery.go # PCI device discovery from sysfs +├── vfio.go # VFIO bind/unbind operations +├── gpu_mode.go # GPU mode detection (vGPU vs passthrough) +├── mdev.go # mdev lifecycle (create, destroy, list, reconcile) +├── manager.go # Manager interface and implementation +├── manager_test.go # Unit tests +├── gpu_e2e_test.go # End-to-end GPU passthrough test +├── GPU.md # GPU and vGPU documentation └── scripts/ - └── gpu-reset.sh # GPU recovery script (see Troubleshooting) + └── gpu-reset.sh # GPU recovery script ``` -## Example: Full Workflow +## Quick Start + +### vGPU Mode (Recommended for Multi-Tenant) ```bash -# 1. Discover available devices +# Check available profiles +curl localhost:8080/resources | jq .gpu + +# Create instance with vGPU +curl -X POST localhost:8080/instances \ + -H "Content-Type: application/json" \ + -d '{ + "name": "ml-training", + "image": "nvidia/cuda:12.4-runtime-ubuntu22.04", + "gpu": {"profile": "L40S-1Q"} + }' + +# Inside VM: verify GPU +nvidia-smi +``` + +### Passthrough Mode (Dedicated GPU) + +```bash +# Discover available devices curl localhost:8080/devices/available -# → [{"pci_address": "0000:a2:00.0", "vendor_name": "NVIDIA Corporation", ...}] -# 2. Register the GPU +# Register the GPU curl -X POST localhost:8080/devices \ -d '{"name": "l4-gpu", "pci_address": "0000:a2:00.0"}' -# 3. Create instance with GPU (auto-binds to VFIO) +# Create instance with GPU curl -X POST localhost:8080/instances \ -d '{"name": "ml-training", "image": "nvidia/cuda:12.0-base", "devices": ["l4-gpu"]}' -# 4. Inside VM: verify GPU -lspci | grep -i nvidia +# Inside VM: verify GPU nvidia-smi -# 5. Delete instance (auto-unbinds from VFIO) +# Delete instance (auto-unbinds from VFIO) curl -X DELETE localhost:8080/instances/{id} -# GPU returns to host control ``` ## Device Lifecycle -### 1. Discovery +### Registration (Passthrough Mode) -Discover passthrough-capable devices on the host: - -``` -GET /devices/available -``` - -Returns PCI devices that are candidates for passthrough (GPUs, 3D controllers). Each device includes its PCI address, vendor/device IDs, IOMMU group, and current driver. - -### 2. Registration - -Register a device with a unique name: +Register a device for whole-GPU passthrough: ``` POST /devices @@ -90,73 +87,56 @@ POST /devices } ``` -Registration does not modify the device's driver binding. The device remains usable by the host until an instance requests it. - -### 3. Instance Creation (Auto-Bind) - -When an instance is created with devices: +### Instance Creation -``` -POST /instances +**vGPU Mode:** +```json { "name": "gpu-workload", - "image": "docker.io/nvidia/cuda:12.0-base", - "devices": ["l4-gpu"] + "image": "nvidia/cuda:12.4-runtime", + "gpu": {"profile": "L40S-1Q"} } ``` -The system automatically: -1. **Validates** the device exists and isn't attached to another instance -2. **Binds to VFIO** if not already bound (unbinds native driver like `nvidia`) -3. **Passes to cloud-hypervisor** via the `--device` flag -4. 
**Marks as attached** to prevent concurrent use +**Passthrough Mode:** +```json +{ + "name": "gpu-workload", + "image": "nvidia/cuda:12.4-runtime", + "devices": ["l4-gpu"] +} +``` -### 4. Instance Deletion (Auto-Unbind) +### Automatic Cleanup -When an instance is deleted, the system automatically: -1. **Marks device as detached** -2. **Unbinds from VFIO** (triggers kernel driver probe to restore native driver) +- **vGPU**: mdev destroyed when instance is deleted +- **Passthrough**: Device unbound from VFIO when instance is deleted +- **Orphaned mdevs**: Cleaned up on server startup -This returns the device to host control so it can be used by other processes or a new instance. +## Hypervisor Integration -### 5. Unregistration +Both Cloud Hypervisor and QEMU receive device paths: +**VFIO passthrough:** ``` -DELETE /devices/{id} +/sys/bus/pci/devices/0000:a2:00.0/ ``` -Removes the device from hypeman's registry. Fails if the device is currently attached to an instance. - -## Cloud Hypervisor Integration - -Cloud-hypervisor receives device passthrough configuration via the `VmConfig.Devices` field: - -```go -vmConfig.Devices = &[]vmm.DeviceConfig{ - { - Path: "/sys/bus/pci/devices/0000:a2:00.0/", - }, -} +**mdev (vGPU):** +``` +/sys/bus/mdev/devices// ``` -Cloud-hypervisor then: -1. Opens the VFIO group file (`/dev/vfio/`) -2. Maps device BARs (memory regions) into guest physical address space -3. Configures interrupt routing (MSI/MSI-X) to the guest -4. The guest sees a real PCIe device and loads native drivers - -### NVIDIA-Specific Options +## Guest Driver Requirements -For multi-GPU configurations, cloud-hypervisor supports GPUDirect P2P: +**Important**: Guest images must include pre-installed NVIDIA drivers. -```go -DeviceConfig{ - Path: "/sys/bus/pci/devices/0000:a2:00.0/", - XNvGpudirectClique: ptr(int8(0)), // Enable P2P within clique 0 -} +```dockerfile +FROM nvidia/cuda:12.4.1-runtime-ubuntu22.04 +RUN apt-get update && apt-get install -y nvidia-utils-550 ``` -This is not currently exposed through the hypeman API but could be added for HPC workloads. +hypeman does NOT inject drivers into guests. ## Constraints and Limitations @@ -164,288 +144,68 @@ This is not currently exposed through the hypeman API but could be added for HPC - **IOMMU must be enabled** in BIOS and kernel (`intel_iommu=on` or `amd_iommu=on`) - All devices in an IOMMU group must be passed through together -- Some motherboards place many devices in the same group (ACS override may help) ### VFIO Module Requirements -The following kernel modules must be loaded: ```bash modprobe vfio_pci modprobe vfio_iommu_type1 ``` -### Driver Binding - -- Binding to VFIO **unloads the native driver** (e.g., `nvidia`, `amdgpu`) -- Host processes using the device will lose access -- Some drivers (like NVIDIA) may resist unbinding if in use - ### Single Attachment -A device can only be attached to one instance at a time. Attempts to attach an already-attached device will fail. +A device (or vGPU profile slot) can only be attached to one instance at a time. ### No Hot-Plug -Devices must be specified at instance creation time. Hot-adding devices to a running VM is not currently supported (though cloud-hypervisor has this capability). 
- -### Guest Driver Requirements - -The guest must have appropriate drivers: -- **NVIDIA GPUs**: Install NVIDIA drivers in the guest image -- **AMD GPUs**: Install amdgpu/ROCm in the guest image - -### Performance Considerations - -- **ACS (Access Control Services)**: Required for proper isolation on some systems -- **Huge Pages**: Recommended for GPU workloads (`hugepages=on` in cloud-hypervisor) -- **CPU Pinning**: Can improve latency for GPU compute workloads +Devices must be specified at instance creation time. ## Troubleshooting ### GPU Reset Script -If GPU passthrough tests fail or hang, the GPU may be left in a bad state (still bound to vfio-pci, or stuck without a driver). Use the provided reset script: +If GPU passthrough tests fail or hang: ```bash -# Reset all NVIDIA GPUs to their native driver +# Reset all NVIDIA GPUs sudo ./lib/devices/scripts/gpu-reset.sh -# Reset a specific GPU +# Reset specific GPU sudo ./lib/devices/scripts/gpu-reset.sh 0000:a2:00.0 ``` -The script will: -1. Kill any stuck cloud-hypervisor processes holding the GPU -2. Unbind from vfio-pci if still bound -3. Clear `driver_override` -4. Trigger driver probe to rebind to the nvidia driver -5. Restart `nvidia-persistenced` - ### Common Issues #### VFIO Bind Hangs -**Symptom**: `BindToVFIO` hangs indefinitely. - -**Cause**: The `nvidia-persistenced` service keeps `/dev/nvidia*` open, preventing driver unbind. - -**Solution**: The code now automatically stops `nvidia-persistenced` before unbinding. If you're testing manually: +**Solution**: Code automatically stops `nvidia-persistenced`. For manual testing: ```bash sudo systemctl stop nvidia-persistenced -# ... do VFIO bind/unbind ... -sudo systemctl start nvidia-persistenced ``` -#### VM Exec Fails After Boot - -**Symptom**: VM boots but exec commands time out. - -**Cause**: Usually the container's main process exited (e.g., `alpine` image runs `/bin/sh` which exits immediately), causing init to exit and the VM to kernel panic. - -**Solution**: Use an image with a long-running process (e.g., `nginx:alpine`) or ensure your container has a persistent entrypoint. - #### GPU Not Restored After Test -**Symptom**: GPU has no driver bound, `nvidia-smi` fails. - -**Solution**: ```bash -# Trigger kernel driver probe sudo sh -c 'echo 0000:a2:00.0 > /sys/bus/pci/drivers_probe' -# Restart nvidia-persistenced sudo systemctl start nvidia-persistenced -# Verify -nvidia-smi ``` -If that fails, a system **reboot** may be necessary. - -#### VFIO Modules Not Loaded +#### vGPU Profile Not Available -**Symptom**: `ErrVFIONotAvailable` error. - -**Solution**: +Check available slots: ```bash -sudo modprobe vfio_pci vfio_iommu_type1 -# Verify -ls /dev/vfio/ +curl localhost:8080/resources | jq '.gpu.profiles' ``` -Add to `/etc/modules-load.d/vfio.conf` for persistence across reboots. - -#### IOMMU Not Enabled - -**Symptom**: No IOMMU groups found, passthrough fails. - -**Solution**: Add kernel parameter to bootloader: -- Intel: `intel_iommu=on iommu=pt` -- AMD: `amd_iommu=on iommu=pt` - -Then reboot. - ### Running the E2E Test -The GPU passthrough E2E test **automatically detects** GPU availability and skips if prerequisites aren't met. - -**Why GPU tests require root**: Unlike network tests which can use Linux capabilities (`CAP_NET_ADMIN`), GPU passthrough requires writing to sysfs files (`/sys/bus/pci/drivers/*/unbind`, etc.) which are protected by standard Unix file permissions (owned by root, mode 0200). 
Capabilities don't bypass DAC (discretionary access control) for file writes. - -Prerequisites for the test to run (not skip): -- **Root permissions** (sudo) - required for sysfs driver operations -- NVIDIA GPU on host -- IOMMU enabled (`intel_iommu=on` or `amd_iommu=on`) -- `vfio_pci` and `vfio_iommu_type1` modules loaded -- `/sbin` in PATH (for `mkfs.ext4`) - ```bash -# Prepare the environment +# Prerequisites sudo modprobe vfio_pci vfio_iommu_type1 -# Run via make - test auto-skips if not root or no GPU -make test - -# Or run directly with sudo +# Run test (auto-skips if no GPU) sudo env PATH=$PATH:/sbin:/usr/sbin \ go test -v -run TestGPUPassthrough -timeout 5m ./lib/devices/... ``` -The test will: -1. Check prerequisites and skip if not met (not root, no GPU, no IOMMU, etc.) -2. Discover available NVIDIA GPUs -3. Register the first GPU found -4. Create a VM with GPU passthrough -5. Verify the GPU is visible inside the VM -6. Clean up (delete VM, unbind from VFIO, restore nvidia driver) - -## Future Plans: GPU Sharing Across Multiple VMs - -### The Problem - -With current VFIO passthrough, a GPU is assigned **exclusively** to one VM. To share a single GPU across multiple VMs (e.g., give each VM a "slice"), you need NVIDIA's **vGPU (GRID)** technology. - -### Why MIG Alone Doesn't Help - -**MIG (Multi-Instance GPU)** partitions a GPU into isolated instances at the hardware level, but: - -- MIG partitions are **not separate PCI devices**—the GPU remains one PCI endpoint -- MIG partitions are accessed via CUDA APIs (`CUDA_VISIBLE_DEVICES=MIG-`) -- You can only VFIO-passthrough the **whole GPU** to one VM -- MIG is useful for workload isolation **within** a single host or VM, not for multi-VM sharing - -``` -Physical GPU (0000:a2:00.0) ─── still ONE PCI device - └── MIG partitions (logical, not separate devices) - ├── MIG Instance 0 ─┐ - ├── MIG Instance 1 ─┼── All accessed via CUDA on the same GPU - └── MIG Instance 2 ─┘ -``` - -**Supported MIG Hardware**: A100, A30, H100, H200 (NOT L4 or consumer GPUs) - -### vGPU/mdev: The Only Path to Multi-VM GPU Sharing - -To assign GPU shares to **separate VMs**, NVIDIA requires their **vGPU (GRID)** technology, which uses the Linux mediated device (mdev) framework. - -#### Cloud-Hypervisor mdev Support Status - -Cloud-hypervisor **does** support mdev passthrough: - -```bash -cloud-hypervisor --device path=/sys/bus/mdev/devices// -``` - -However, NVIDIA's proprietary vGPU manager has a QEMU-specific quirk: it reads the VMM process's `/proc//cmdline` looking for a `-uuid` argument to map mdev UUIDs to VMs. This doesn't work out-of-the-box with cloud-hypervisor. - -**Workarounds** (from [cloud-hypervisor#5319](https://github.com/cloud-hypervisor/cloud-hypervisor/issues/5319)): -- Patch CH to accept a dummy `-uuid` flag -- Use wrapper scripts that inject the UUID into the process name -- Wait for NVIDIA to fix their driver's VMM assumptions - -#### vGPU Requirements - -- **Hardware**: Datacenter GPUs (A100, L40, etc.) -- **Licensing**: NVIDIA GRID subscription ($$/GPU/year) -- **Host Software**: NVIDIA vGPU Manager installed on host -- **Guest Drivers**: vGPU-aware guest drivers - -### Design Changes for mdev/vGPU Support - -#### 1. New Device Type: `MdevDevice` - -```go -type MdevDevice struct { - UUID string // mdev instance UUID - ParentGPU string // PCI address of parent GPU - Type string // vGPU type (e.g., "nvidia-256") - Available bool // Not assigned to a VM -} -``` - -#### 2. 
Discovery Extensions - -```go -// List mdev types supported by a GPU -func (m *manager) ListMdevTypes(ctx context.Context, pciAddress string) ([]MdevType, error) - -// List existing mdev instances -func (m *manager) ListMdevInstances(ctx context.Context) ([]MdevDevice, error) - -// Create an mdev instance -func (m *manager) CreateMdevInstance(ctx context.Context, pciAddress, mdevType string) (*MdevDevice, error) - -// Destroy an mdev instance -func (m *manager) DestroyMdevInstance(ctx context.Context, uuid string) error -``` - -#### 3. Passthrough Mechanism - -mdev devices use a different sysfs path: - -``` -# mdev device path -/sys/bus/mdev/devices// - -# vs VFIO-PCI (current) -/sys/bus/pci/devices/0000:a2:00.0/ -``` - -Cloud-hypervisor's `--device` flag already accepts mdev paths. - -#### 4. NVIDIA vGPU Workaround - -To work around NVIDIA's QEMU-specific UUID detection, we may need to: -- Add a `--platform uuid=` option to cloud-hypervisor invocation -- Or use a wrapper that sets the process name appropriately - -### Implementation Phases - -**Phase 1**: mdev Discovery & Passthrough -- Detect mdev-capable GPUs -- List available mdev types and instances -- Pass mdev devices to VMs (path already works) - -**Phase 2**: mdev Lifecycle Management -- Create/destroy mdev instances via sysfs -- API endpoints for mdev management - -**Phase 3**: NVIDIA vGPU Integration -- Implement UUID workaround for NVIDIA's driver -- Test with GRID licensing -- Document guest driver requirements - -### How vGPU + MIG Work Together - -vGPU creates mdev devices that can be backed by MIG partitions, giving you both hardware isolation (MIG) and multi-VM assignment (vGPU): - -``` -Physical GPU (one PCI device) - │ - ├── Without vGPU: VFIO passthrough gives whole GPU to ONE VM - │ - └── With vGPU (GRID license required): - └── MIG Mode enabled on host - ├── MIG Instance 0 ──→ vGPU mdev A ──→ VM 1 - ├── MIG Instance 1 ──→ vGPU mdev B ──→ VM 2 - └── MIG Instance 2 ──→ vGPU mdev C ──→ VM 3 -``` - -Without vGPU, MIG is only useful for workload isolation on the host or within a single VM that owns the whole GPU. +**Why root is required**: sysfs driver operations require writing to files owned by root with mode 0200. diff --git a/lib/devices/gpu_e2e_test.go b/lib/devices/gpu_e2e_test.go index e279ed51..e245190c 100644 --- a/lib/devices/gpu_e2e_test.go +++ b/lib/devices/gpu_e2e_test.go @@ -35,6 +35,10 @@ import ( // // sudo env PATH=$PATH:/sbin:/usr/sbin go test -v -run TestGPUPassthrough ./lib/devices/... // +// Note: This test only verifies PCI device visibility (vendor ID 0x10de), not +// driver functionality. To test nvidia-smi or CUDA, use an image with pre-installed +// NVIDIA guest drivers (e.g., nvidia/cuda with nvidia-utils-550). +// // WARNING: This test will unbind the GPU from the nvidia driver, which may // disrupt other processes using the GPU. The test attempts to restore the // nvidia driver binding on cleanup. @@ -142,7 +146,10 @@ func TestGPUPassthrough(t *testing.T) { require.NoError(t, err) t.Log("System files ready") - // Step 4: Pull nginx:alpine (nginx keeps running unlike plain alpine which exits immediately) + // Step 4: Pull nginx:alpine image + // Note: This image doesn't have NVIDIA drivers, but that's fine - this test only + // verifies PCI device visibility. For full GPU functionality tests, use an image + // with pre-installed NVIDIA guest drivers. 
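+	// A driver-equipped image for such tests could be built roughly as follows
+	// (illustrative sketch; the utils package version must match the host driver,
+	// so treat "nvidia-utils-550" as an example only):
+	//
+	//	FROM nvidia/cuda:12.4.1-runtime-ubuntu22.04
+	//	RUN apt-get update && apt-get install -y nvidia-utils-550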
t.Log("Step 4: Pulling nginx:alpine image...") createdImg, createErr := imageMgr.CreateImage(ctx, images.CreateImageRequest{ Name: "docker.io/library/nginx:alpine", diff --git a/lib/devices/gpu_mode.go b/lib/devices/gpu_mode.go new file mode 100644 index 00000000..40b3b2ba --- /dev/null +++ b/lib/devices/gpu_mode.go @@ -0,0 +1,30 @@ +package devices + +import ( + "os" +) + +// DetectHostGPUMode determines the host's GPU configuration mode. +// +// Returns: +// - GPUModeVGPU if /sys/class/mdev_bus has entries (SR-IOV VFs present) +// - GPUModePassthrough if NVIDIA GPUs are available for VFIO passthrough +// - GPUModeNone if no GPUs are available +// +// Note: A host is configured for either vGPU or passthrough, not both, +// because the host driver determines which mode is available. +func DetectHostGPUMode() GPUMode { + // Check for vGPU mode first (SR-IOV VFs present) + entries, err := os.ReadDir("/sys/class/mdev_bus") + if err == nil && len(entries) > 0 { + return GPUModeVGPU + } + + // Check for passthrough mode (physical GPUs available) + gpus, err := DiscoverAvailableDevices() + if err == nil && len(gpus) > 0 { + return GPUModePassthrough + } + + return GPUModeNone +} diff --git a/lib/devices/mdev.go b/lib/devices/mdev.go new file mode 100644 index 00000000..fd2334c8 --- /dev/null +++ b/lib/devices/mdev.go @@ -0,0 +1,568 @@ +package devices + +import ( + "bufio" + "context" + "encoding/json" + "fmt" + "os" + "os/exec" + "path/filepath" + "regexp" + "strconv" + "strings" + "sync" + + "github.com/google/uuid" + "github.com/onkernel/hypeman/lib/logger" +) + +const ( + mdevBusPath = "/sys/class/mdev_bus" + mdevDevices = "/sys/bus/mdev/devices" +) + +// mdevMu protects mdev creation/destruction to prevent race conditions +// when multiple instances request vGPUs concurrently. +var mdevMu sync.Mutex + +// profileMetadata holds static profile info (doesn't change after driver load) +type profileMetadata struct { + TypeName string // e.g., "nvidia-1145" + Name string // e.g., "NVIDIA L40S-1B" + FramebufferMB int +} + +// cachedProfiles holds static profile metadata, loaded once on first access +var ( + cachedProfiles []profileMetadata + cachedProfilesOnce sync.Once +) + +// DiscoverVFs returns all SR-IOV Virtual Functions available for vGPU. +// These are discovered by scanning /sys/class/mdev_bus/ which contains +// VFs that can host mdev devices. 
+func DiscoverVFs() ([]VirtualFunction, error) { + entries, err := os.ReadDir(mdevBusPath) + if err != nil { + if os.IsNotExist(err) { + return nil, nil // No mdev_bus means no vGPU support + } + return nil, fmt.Errorf("read mdev_bus: %w", err) + } + + // List mdevs once and build a lookup map to avoid O(n*m) performance + mdevs, _ := ListMdevDevices() + mdevByVF := make(map[string]bool, len(mdevs)) + for _, mdev := range mdevs { + mdevByVF[mdev.VFAddress] = true + } + + var vfs []VirtualFunction + for _, entry := range entries { + vfAddr := entry.Name() + + // Find parent GPU by checking physfn symlink + // VFs have a physfn symlink pointing to their parent Physical Function + physfnPath := filepath.Join("/sys/bus/pci/devices", vfAddr, "physfn") + parentGPU := "" + if target, err := os.Readlink(physfnPath); err == nil { + parentGPU = filepath.Base(target) + } + + // Check if this VF already has an mdev (using pre-built lookup map) + hasMdev := mdevByVF[vfAddr] + + vfs = append(vfs, VirtualFunction{ + PCIAddress: vfAddr, + ParentGPU: parentGPU, + HasMdev: hasMdev, + }) + } + + return vfs, nil +} + +// ListGPUProfiles returns available vGPU profiles with availability counts. +// Profiles are discovered from the first VF's mdev_supported_types directory. +func ListGPUProfiles() ([]GPUProfile, error) { + vfs, err := DiscoverVFs() + if err != nil { + return nil, err + } + return ListGPUProfilesWithVFs(vfs) +} + +// ListGPUProfilesWithVFs returns available vGPU profiles using pre-discovered VFs. +// This avoids redundant VF discovery when the caller already has the list. +func ListGPUProfilesWithVFs(vfs []VirtualFunction) ([]GPUProfile, error) { + if len(vfs) == 0 { + return nil, nil + } + + // Load static profile metadata once (cached indefinitely) + cachedProfilesOnce.Do(func() { + cachedProfiles = loadProfileMetadata(vfs[0].PCIAddress) + }) + + // Build result with dynamic availability counts + profiles := make([]GPUProfile, 0, len(cachedProfiles)) + for _, meta := range cachedProfiles { + profiles = append(profiles, GPUProfile{ + Name: meta.Name, + FramebufferMB: meta.FramebufferMB, + Available: countAvailableVFsForProfile(vfs, meta.TypeName), + }) + } + + return profiles, nil +} + +// loadProfileMetadata reads static profile info from sysfs (called once) +func loadProfileMetadata(firstVF string) []profileMetadata { + typesPath := filepath.Join(mdevBusPath, firstVF, "mdev_supported_types") + entries, err := os.ReadDir(typesPath) + if err != nil { + return nil + } + + var profiles []profileMetadata + for _, entry := range entries { + if !entry.IsDir() { + continue + } + + typeName := entry.Name() + typeDir := filepath.Join(typesPath, typeName) + + nameBytes, err := os.ReadFile(filepath.Join(typeDir, "name")) + if err != nil { + continue + } + + profiles = append(profiles, profileMetadata{ + TypeName: typeName, + Name: strings.TrimSpace(string(nameBytes)), + FramebufferMB: parseFramebufferFromDescription(typeDir), + }) + } + + return profiles +} + +// parseFramebufferFromDescription extracts framebuffer size from profile description +func parseFramebufferFromDescription(typeDir string) int { + descBytes, err := os.ReadFile(filepath.Join(typeDir, "description")) + if err != nil { + return 0 + } + + // Description format varies but typically contains "framebuffer=1024M" or similar + desc := string(descBytes) + + // Try to find framebuffer size in MB + re := regexp.MustCompile(`framebuffer=(\d+)M`) + if matches := re.FindStringSubmatch(desc); len(matches) > 1 { + if mb, err := 
strconv.Atoi(matches[1]); err == nil { + return mb + } + } + + // Also try comma-separated format like "num_heads=4, frl_config=60, framebuffer=1024M" + scanner := bufio.NewScanner(strings.NewReader(desc)) + for scanner.Scan() { + line := scanner.Text() + if strings.Contains(line, "framebuffer") { + parts := strings.Split(line, ",") + for _, part := range parts { + part = strings.TrimSpace(part) + if strings.HasPrefix(part, "framebuffer=") { + sizeStr := strings.TrimPrefix(part, "framebuffer=") + sizeStr = strings.TrimSuffix(sizeStr, "M") + if mb, err := strconv.Atoi(sizeStr); err == nil { + return mb + } + } + } + } + } + + return 0 +} + +// countAvailableVFsForProfile counts available instances for a profile type. +// Optimized: all VFs on the same parent GPU have identical profile support, +// so we only sample one VF per parent instead of reading from every VF. +func countAvailableVFsForProfile(vfs []VirtualFunction, profileType string) int { + if len(vfs) == 0 { + return 0 + } + + // Group free VFs by parent GPU + freeVFsByParent := make(map[string][]VirtualFunction) + for _, vf := range vfs { + if vf.HasMdev { + continue + } + freeVFsByParent[vf.ParentGPU] = append(freeVFsByParent[vf.ParentGPU], vf) + } + + count := 0 + for _, parentVFs := range freeVFsByParent { + if len(parentVFs) == 0 { + continue + } + // Sample just ONE VF per parent - all VFs on same parent have same profiles + sampleVF := parentVFs[0] + availPath := filepath.Join(mdevBusPath, sampleVF.PCIAddress, "mdev_supported_types", profileType, "available_instances") + data, err := os.ReadFile(availPath) + if err != nil { + continue + } + instances, err := strconv.Atoi(strings.TrimSpace(string(data))) + if err != nil || instances < 1 { + continue + } + // Profile is available - count all free VFs on this parent + count += len(parentVFs) + } + return count +} + +// findProfileType finds the internal type name (e.g., "nvidia-556") for a profile name (e.g., "L40S-1Q") +func findProfileType(profileName string) (string, error) { + vfs, err := DiscoverVFs() + if err != nil || len(vfs) == 0 { + return "", fmt.Errorf("no VFs available") + } + + firstVF := vfs[0].PCIAddress + typesPath := filepath.Join(mdevBusPath, firstVF, "mdev_supported_types") + entries, err := os.ReadDir(typesPath) + if err != nil { + return "", fmt.Errorf("read mdev_supported_types: %w", err) + } + + for _, entry := range entries { + if !entry.IsDir() { + continue + } + typeName := entry.Name() + nameBytes, err := os.ReadFile(filepath.Join(typesPath, typeName, "name")) + if err != nil { + continue + } + if strings.TrimSpace(string(nameBytes)) == profileName { + return typeName, nil + } + } + + return "", fmt.Errorf("profile %q not found", profileName) +} + +// mdevctlDevice represents the JSON structure from mdevctl list +type mdevctlDevice struct { + Start string `json:"start,omitempty"` + MdevType string `json:"mdev_type,omitempty"` + ManuallyDef bool `json:"manually_defined,omitempty"` + ParentDevice string `json:"parent,omitempty"` +} + +// ListMdevDevices returns all active mdev devices on the host. +func ListMdevDevices() ([]MdevDevice, error) { + // Try mdevctl first + output, err := exec.Command("mdevctl", "list", "-d", "--dumpjson").Output() + if err == nil && len(output) > 0 { + return parseMdevctlOutput(output) + } + + // Fallback to sysfs scanning + return scanMdevDevices() +} + +// parseMdevctlOutput parses the JSON output from mdevctl list +func parseMdevctlOutput(output []byte) ([]MdevDevice, error) { + // mdevctl outputs: { "uuid": { ... 
}, "uuid2": { ... } } + var rawMap map[string][]mdevctlDevice + if err := json.Unmarshal(output, &rawMap); err != nil { + return nil, fmt.Errorf("parse mdevctl output: %w", err) + } + + var mdevs []MdevDevice + for uuid, devices := range rawMap { + if len(devices) == 0 { + continue + } + dev := devices[0] + + // Get profile name from mdev type + profileName := getProfileNameFromType(dev.MdevType, dev.ParentDevice) + + mdevs = append(mdevs, MdevDevice{ + UUID: uuid, + VFAddress: dev.ParentDevice, + ProfileType: dev.MdevType, + ProfileName: profileName, + SysfsPath: filepath.Join(mdevDevices, uuid), + InstanceID: "", // Not tracked by mdevctl, we track separately + }) + } + + return mdevs, nil +} + +// scanMdevDevices scans /sys/bus/mdev/devices for active mdevs +func scanMdevDevices() ([]MdevDevice, error) { + entries, err := os.ReadDir(mdevDevices) + if err != nil { + if os.IsNotExist(err) { + return nil, nil + } + return nil, fmt.Errorf("read mdev devices: %w", err) + } + + var mdevs []MdevDevice + for _, entry := range entries { + uuid := entry.Name() + mdevPath := filepath.Join(mdevDevices, uuid) + + // Read mdev_type symlink to get profile type + typeLink, err := os.Readlink(filepath.Join(mdevPath, "mdev_type")) + if err != nil { + continue + } + profileType := filepath.Base(typeLink) + + // Get parent VF from symlink + parentLink, err := os.Readlink(mdevPath) + if err != nil { + continue + } + // Parent path looks like ../../../devices/pci.../0000:82:00.4/uuid + parts := strings.Split(parentLink, "/") + vfAddress := "" + for i, p := range parts { + if strings.HasPrefix(p, "0000:") && i+1 < len(parts) && parts[i+1] == uuid { + vfAddress = p + break + } + } + + profileName := getProfileNameFromType(profileType, vfAddress) + + mdevs = append(mdevs, MdevDevice{ + UUID: uuid, + VFAddress: vfAddress, + ProfileType: profileType, + ProfileName: profileName, + SysfsPath: mdevPath, + InstanceID: "", + }) + } + + return mdevs, nil +} + +// getProfileNameFromType resolves internal type (nvidia-556) to profile name (L40S-1Q) +func getProfileNameFromType(profileType, vfAddress string) string { + if vfAddress == "" { + return profileType // Fallback to type if no VF + } + + namePath := filepath.Join(mdevBusPath, vfAddress, "mdev_supported_types", profileType, "name") + data, err := os.ReadFile(namePath) + if err != nil { + return profileType + } + return strings.TrimSpace(string(data)) +} + +// CreateMdev creates an mdev device for the given profile and instance. +// It finds an available VF and creates the mdev, returning the device info. +// This function is thread-safe and uses a mutex to prevent race conditions +// when multiple instances request vGPUs concurrently. 
+func CreateMdev(ctx context.Context, profileName, instanceID string) (*MdevDevice, error) { + log := logger.FromContext(ctx) + + // Lock to prevent race conditions when multiple instances request the same profile + mdevMu.Lock() + defer mdevMu.Unlock() + + // Find profile type from name + profileType, err := findProfileType(profileName) + if err != nil { + return nil, err + } + + // Find an available VF + vfs, err := DiscoverVFs() + if err != nil { + return nil, fmt.Errorf("discover VFs: %w", err) + } + + var targetVF string + for _, vf := range vfs { + // Check if this VF can create the profile + availPath := filepath.Join(mdevBusPath, vf.PCIAddress, "mdev_supported_types", profileType, "available_instances") + data, err := os.ReadFile(availPath) + if err != nil { + continue + } + instances, err := strconv.Atoi(strings.TrimSpace(string(data))) + if err != nil || instances < 1 { + continue + } + targetVF = vf.PCIAddress + break + } + + if targetVF == "" { + return nil, fmt.Errorf("no available VF for profile %q", profileName) + } + + // Generate UUID for the mdev + mdevUUID := uuid.New().String() + + log.DebugContext(ctx, "creating mdev device", "profile", profileName, "vf", targetVF, "uuid", mdevUUID, "instance_id", instanceID) + + // Create mdev by writing UUID to create file + createPath := filepath.Join(mdevBusPath, targetVF, "mdev_supported_types", profileType, "create") + if err := os.WriteFile(createPath, []byte(mdevUUID), 0200); err != nil { + return nil, fmt.Errorf("create mdev on VF %s: %w", targetVF, err) + } + + log.InfoContext(ctx, "created mdev device", "profile", profileName, "vf", targetVF, "uuid", mdevUUID, "instance_id", instanceID) + + return &MdevDevice{ + UUID: mdevUUID, + VFAddress: targetVF, + ProfileType: profileType, + ProfileName: profileName, + SysfsPath: filepath.Join(mdevDevices, mdevUUID), + InstanceID: instanceID, + }, nil +} + +// DestroyMdev removes an mdev device. +func DestroyMdev(ctx context.Context, mdevUUID string) error { + log := logger.FromContext(ctx) + + // Lock to prevent race conditions during destruction + mdevMu.Lock() + defer mdevMu.Unlock() + + log.DebugContext(ctx, "destroying mdev device", "uuid", mdevUUID) + + // Try mdevctl undefine first (removes persistent definition) + if err := exec.Command("mdevctl", "undefine", "--uuid", mdevUUID).Run(); err != nil { + // Log at debug level - mdevctl might not be installed or mdev might not be defined + log.DebugContext(ctx, "mdevctl undefine failed (may be expected)", "uuid", mdevUUID, "error", err) + } + + // Remove via sysfs + removePath := filepath.Join(mdevDevices, mdevUUID, "remove") + if err := os.WriteFile(removePath, []byte("1"), 0200); err != nil { + if os.IsNotExist(err) { + log.DebugContext(ctx, "mdev already removed", "uuid", mdevUUID) + return nil // Already removed + } + return fmt.Errorf("remove mdev %s: %w", mdevUUID, err) + } + + log.InfoContext(ctx, "destroyed mdev device", "uuid", mdevUUID) + return nil +} + +// IsMdevInUse checks if an mdev device is currently bound to a driver (in use by a VM). +// An mdev with a driver symlink is actively attached to a hypervisor/VFIO. 
+func IsMdevInUse(mdevUUID string) bool { + driverPath := filepath.Join(mdevDevices, mdevUUID, "driver") + _, err := os.Readlink(driverPath) + return err == nil // Has a driver = in use +} + +// MdevReconcileInfo contains information needed to reconcile mdevs for an instance +type MdevReconcileInfo struct { + InstanceID string + MdevUUID string + IsRunning bool // true if instance's VMM is running or state is unknown +} + +// ReconcileMdevs destroys orphaned mdevs that belong to hypeman but are no longer in use. +// This is called on server startup to clean up stale mdevs from previous runs. +// +// Safety guarantees: +// - Only destroys mdevs that are tracked by hypeman instances (via hypemanMdevs map) +// - Never destroys mdevs created by other processes on the host +// - Skips mdevs that are currently bound to a driver (in use by a VM) +// - Skips mdevs for instances in Running or Unknown state +func ReconcileMdevs(ctx context.Context, instanceInfos []MdevReconcileInfo) error { + log := logger.FromContext(ctx) + + mdevs, err := ListMdevDevices() + if err != nil { + return fmt.Errorf("list mdevs: %w", err) + } + + if len(mdevs) == 0 { + log.DebugContext(ctx, "no mdev devices found to reconcile") + return nil + } + + // Build lookup maps from instance info + // mdevUUID -> instanceID for mdevs managed by hypeman + hypemanMdevs := make(map[string]string, len(instanceInfos)) + // instanceID -> isRunning for liveness check + instanceRunning := make(map[string]bool, len(instanceInfos)) + for _, info := range instanceInfos { + if info.MdevUUID != "" { + hypemanMdevs[info.MdevUUID] = info.InstanceID + instanceRunning[info.InstanceID] = info.IsRunning + } + } + + log.InfoContext(ctx, "reconciling mdev devices", "total_mdevs", len(mdevs), "hypeman_mdevs", len(hypemanMdevs)) + + var destroyed, skippedNotOurs, skippedInUse, skippedRunning int + for _, mdev := range mdevs { + // Only consider mdevs that hypeman created + instanceID, isOurs := hypemanMdevs[mdev.UUID] + if !isOurs { + log.DebugContext(ctx, "skipping mdev not managed by hypeman", "uuid", mdev.UUID, "profile", mdev.ProfileName) + skippedNotOurs++ + continue + } + + // Skip if instance is running or in unknown state (might still be using the mdev) + if instanceRunning[instanceID] { + log.DebugContext(ctx, "skipping mdev for running/unknown instance", "uuid", mdev.UUID, "instance_id", instanceID) + skippedRunning++ + continue + } + + // Check if mdev is bound to a driver (in use by VM) + if IsMdevInUse(mdev.UUID) { + log.WarnContext(ctx, "skipping mdev still bound to driver", "uuid", mdev.UUID, "instance_id", instanceID) + skippedInUse++ + continue + } + + // Safe to destroy - it's ours, instance is not running, and not bound to driver + log.InfoContext(ctx, "destroying orphaned mdev", "uuid", mdev.UUID, "profile", mdev.ProfileName, "instance_id", instanceID) + if err := DestroyMdev(ctx, mdev.UUID); err != nil { + // Log error but continue - best effort cleanup + log.WarnContext(ctx, "failed to destroy orphaned mdev", "uuid", mdev.UUID, "error", err) + continue + } + destroyed++ + } + + log.InfoContext(ctx, "mdev reconciliation complete", + "destroyed", destroyed, + "skipped_not_ours", skippedNotOurs, + "skipped_in_use", skippedInUse, + "skipped_running", skippedRunning, + ) + + return nil +} diff --git a/lib/devices/testdata/ollama-cuda/Dockerfile b/lib/devices/testdata/ollama-cuda/Dockerfile index d31107ff..9c2e06c7 100644 --- a/lib/devices/testdata/ollama-cuda/Dockerfile +++ b/lib/devices/testdata/ollama-cuda/Dockerfile @@ -1,18 +1,19 
@@ # Minimal CUDA image for GPU inference testing # -# NO NVIDIA DRIVER INSTALLATION NEEDED! -# hypeman automatically injects the matching driver libraries at VM boot time. -# See lib/devices/GPU.md for documentation on driver injection. +# IMPORTANT: NVIDIA guest drivers must be pre-installed in the image. +# hypeman does NOT inject drivers - the guest image is responsible for +# including matching NVIDIA drivers for vGPU/GPU passthrough. # -# This image demonstrates that standard CUDA runtime images work out of the box -# with hypeman's GPU passthrough - no driver version matching required. +# This image includes the NVIDIA driver userspace libraries from the CUDA +# runtime image plus nvidia-utils for nvidia-smi. FROM nvidia/cuda:12.4.1-runtime-ubuntu22.04 -# Install dependencies and Ollama -# Note: We use the runtime image (not devel) since we don't need CUDA compilation tools +# Install NVIDIA driver utilities and dependencies +# The nvidia-utils package provides nvidia-smi and matches the driver version +# needed for the vGPU profile assigned by hypeman. RUN apt-get update && \ - apt-get install -y curl ca-certificates python3 && \ + apt-get install -y curl ca-certificates python3 nvidia-utils-550 && \ curl -fsSL https://ollama.com/install.sh | sh && \ rm -rf /var/lib/apt/lists/* @@ -21,7 +22,7 @@ COPY test-nvml.py /usr/local/bin/test-nvml.py COPY test-cuda.py /usr/local/bin/test-cuda.py RUN chmod +x /usr/local/bin/test-nvml.py /usr/local/bin/test-cuda.py -# Ensure libraries are in the path (hypeman injects to /usr/lib/x86_64-linux-gnu) +# Ensure libraries are in the path ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH} ENV PATH=/usr/local/cuda/bin:/usr/bin:${PATH} diff --git a/lib/devices/types.go b/lib/devices/types.go index ca7b68ed..bd66fa86 100644 --- a/lib/devices/types.go +++ b/lib/devices/types.go @@ -15,15 +15,15 @@ const ( // Device represents a registered PCI device for passthrough type Device struct { - Id string `json:"id"` // cuid2 identifier - Name string `json:"name"` // user-provided globally unique name - Type DeviceType `json:"type"` // gpu or pci - PCIAddress string `json:"pci_address"` // e.g., "0000:a2:00.0" - VendorID string `json:"vendor_id"` // e.g., "10de" - DeviceID string `json:"device_id"` // e.g., "27b8" - IOMMUGroup int `json:"iommu_group"` // IOMMU group number + Id string `json:"id"` // cuid2 identifier + Name string `json:"name"` // user-provided globally unique name + Type DeviceType `json:"type"` // gpu or pci + PCIAddress string `json:"pci_address"` // e.g., "0000:a2:00.0" + VendorID string `json:"vendor_id"` // e.g., "10de" + DeviceID string `json:"device_id"` // e.g., "27b8" + IOMMUGroup int `json:"iommu_group"` // IOMMU group number BoundToVFIO bool `json:"bound_to_vfio"` // whether device is bound to vfio-pci - AttachedTo *string `json:"attached_to"` // instance ID if attached, nil otherwise + AttachedTo *string `json:"attached_to"` // instance ID if attached, nil otherwise CreatedAt time.Time `json:"created_at"` } @@ -53,4 +53,44 @@ func ValidateDeviceName(name string) bool { return DeviceNamePattern.MatchString(name) } +// GPUMode represents the host's GPU configuration mode +type GPUMode string +const ( + // GPUModePassthrough indicates whole GPU VFIO passthrough + GPUModePassthrough GPUMode = "passthrough" + // GPUModeVGPU indicates SR-IOV + mdev based vGPU + GPUModeVGPU GPUMode = "vgpu" + // GPUModeNone indicates no GPU available + GPUModeNone GPUMode = "none" +) + +// VirtualFunction represents an 
SR-IOV Virtual Function for vGPU +type VirtualFunction struct { + PCIAddress string `json:"pci_address"` // e.g., "0000:82:00.4" + ParentGPU string `json:"parent_gpu"` // e.g., "0000:82:00.0" + HasMdev bool `json:"has_mdev"` // true if an mdev is created on this VF +} + +// MdevDevice represents an active mediated device (vGPU instance) +type MdevDevice struct { + UUID string `json:"uuid"` // e.g., "aa618089-8b16-4d01-a136-25a0f3c73123" + VFAddress string `json:"vf_address"` // VF this mdev resides on + ProfileType string `json:"profile_type"` // internal type name, e.g., "nvidia-556" + ProfileName string `json:"profile_name"` // user-facing name, e.g., "L40S-1Q" + SysfsPath string `json:"sysfs_path"` // path for VMM device attachment + InstanceID string `json:"instance_id"` // instance this mdev is attached to +} + +// GPUProfile describes an available vGPU profile type +type GPUProfile struct { + Name string `json:"name"` // user-facing name, e.g., "L40S-1Q" + FramebufferMB int `json:"framebuffer_mb"` // frame buffer size in MB + Available int `json:"available"` // number of VFs that can create this profile +} + +// PassthroughDevice describes a physical GPU available for passthrough +type PassthroughDevice struct { + Name string `json:"name"` // GPU name, e.g., "NVIDIA L40S" + Available bool `json:"available"` // true if not attached to an instance +} diff --git a/lib/hypervisor/qemu/config.go b/lib/hypervisor/qemu/config.go index 57c539aa..2f5dd69b 100644 --- a/lib/hypervisor/qemu/config.go +++ b/lib/hypervisor/qemu/config.go @@ -4,6 +4,7 @@ import ( "fmt" "runtime" "strconv" + "strings" "github.com/onkernel/hypeman/lib/hypervisor" ) @@ -64,9 +65,23 @@ func BuildArgs(cfg hypervisor.VMConfig) []string { args = append(args, "-device", fmt.Sprintf("vhost-vsock-pci,guest-cid=%d", cfg.VsockCID)) } - // PCI device passthrough (GPU, etc.) - for _, pciAddr := range cfg.PCIDevices { - args = append(args, "-device", fmt.Sprintf("vfio-pci,host=%s", pciAddr)) + // PCI device passthrough (GPU, mdev vGPU, etc.) 
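As a quick reference for the loop below, the three accepted forms of a cfg.PCIDevices entry map to QEMU arguments as follows (the UUID and PCI address are example values taken from elsewhere in this change):

	/sys/bus/mdev/devices/aa618089-8b16-4d01-a136-25a0f3c73123  ->  -device vfio-pci,sysfsdev=/sys/bus/mdev/devices/aa618089-8b16-4d01-a136-25a0f3c73123
	/sys/bus/pci/devices/0000:82:00.4/                          ->  -device vfio-pci,host=0000:82:00.4
	0000:82:00.4                                                ->  -device vfio-pci,host=0000:82:00.4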
+ for _, devicePath := range cfg.PCIDevices { + var deviceArg string + if strings.HasPrefix(devicePath, "/sys/bus/mdev/devices/") { + // mdev device (vGPU) - use sysfsdev parameter + deviceArg = fmt.Sprintf("vfio-pci,sysfsdev=%s", devicePath) + } else if strings.HasPrefix(devicePath, "/sys/bus/pci/devices/") { + // Full sysfs path for regular PCI device - extract the PCI address + // Path format: /sys/bus/pci/devices/0000:82:00.4/ + parts := strings.Split(strings.TrimSuffix(devicePath, "/"), "/") + pciAddr := parts[len(parts)-1] + deviceArg = fmt.Sprintf("vfio-pci,host=%s", pciAddr) + } else { + // Raw PCI address (e.g., "0000:82:00.4") + deviceArg = fmt.Sprintf("vfio-pci,host=%s", devicePath) + } + args = append(args, "-device", deviceArg) } // Serial console output to file diff --git a/lib/instances/configdisk.go b/lib/instances/configdisk.go index c0e4e5b7..a8f1bc87 100644 --- a/lib/instances/configdisk.go +++ b/lib/instances/configdisk.go @@ -9,7 +9,6 @@ import ( "strconv" "strings" - "github.com/onkernel/hypeman/lib/devices" "github.com/onkernel/hypeman/lib/images" "github.com/onkernel/hypeman/lib/network" "github.com/onkernel/hypeman/lib/vmconfig" @@ -69,15 +68,6 @@ func (m *manager) buildGuestConfig(ctx context.Context, inst *Instance, imageInf cfg.GuestDNS = netConfig.DNS } - // GPU passthrough - check if any attached device is a GPU - for _, deviceID := range inst.Devices { - device, err := m.deviceManager.GetDevice(ctx, deviceID) - if err == nil && device.Type == devices.DeviceTypeGPU { - cfg.HasGPU = true - break - } - } - // Volume mounts // Volumes are attached as /dev/vdd, /dev/vde, etc. (after vda=rootfs, vdb=overlay, vdc=config) deviceIdx := 0 diff --git a/lib/instances/create.go b/lib/instances/create.go index 194a3a9b..6efee14e 100644 --- a/lib/instances/create.go +++ b/lib/instances/create.go @@ -237,6 +237,8 @@ func (m *manager) createInstance( // whatever devices have been attached when cleanup runs. var attachedDeviceIDs []string var resolvedDeviceIDs []string + var gpuProfile string + var gpuMdevUUID string // Setup cleanup stack early so device attachment errors trigger cleanup cu := cleanup.Make(func() { @@ -255,6 +257,25 @@ func (m *manager) createInstance( }) } + // Handle vGPU profile request - create mdev device + if req.GPU != nil && req.GPU.Profile != "" { + log.InfoContext(ctx, "creating vGPU mdev", "instance_id", id, "profile", req.GPU.Profile) + mdev, err := devices.CreateMdev(ctx, req.GPU.Profile, id) + if err != nil { + log.ErrorContext(ctx, "failed to create mdev", "profile", req.GPU.Profile, "error", err) + return nil, fmt.Errorf("create vGPU mdev for profile %s: %w", req.GPU.Profile, err) + } + gpuProfile = req.GPU.Profile + gpuMdevUUID = mdev.UUID + log.InfoContext(ctx, "created vGPU mdev", "instance_id", id, "profile", gpuProfile, "uuid", gpuMdevUUID) + + // Add mdev cleanup to stack + cu.Add(func() { + log.DebugContext(ctx, "destroying mdev on cleanup", "instance_id", id, "uuid", gpuMdevUUID) + devices.DestroyMdev(ctx, gpuMdevUUID) + }) + } + if len(req.Devices) > 0 && m.deviceManager != nil { for _, deviceRef := range req.Devices { device, err := m.deviceManager.GetDevice(ctx, deviceRef) @@ -310,6 +331,8 @@ func (m *manager) createInstance( VsockCID: vsockCID, VsockSocket: vsockSocket, Devices: resolvedDeviceIDs, + GPUProfile: gpuProfile, + GPUMdevUUID: gpuMdevUUID, } // 12. 
Ensure directories @@ -673,6 +696,12 @@ func (m *manager) buildHypervisorConfig(ctx context.Context, inst *Instance, ima } } + // Add vGPU mdev device if configured + if inst.GPUMdevUUID != "" { + mdevPath := filepath.Join("/sys/bus/mdev/devices", inst.GPUMdevUUID) + pciDevices = append(pciDevices, mdevPath) + } + // Build topology if available var topology *hypervisor.CPUTopology if hostTopo := calculateGuestTopology(inst.Vcpus, m.hostTopology); hostTopo != nil { diff --git a/lib/instances/delete.go b/lib/instances/delete.go index 5840e7f1..c4cfe3f4 100644 --- a/lib/instances/delete.go +++ b/lib/instances/delete.go @@ -7,6 +7,7 @@ import ( "syscall" "time" + "github.com/onkernel/hypeman/lib/devices" "github.com/onkernel/hypeman/lib/guest" "github.com/onkernel/hypeman/lib/hypervisor" "github.com/onkernel/hypeman/lib/logger" @@ -94,6 +95,15 @@ func (m *manager) deleteInstance( } } + // 6c. Destroy vGPU mdev device if present + if inst.GPUMdevUUID != "" { + log.InfoContext(ctx, "destroying vGPU mdev", "instance_id", id, "uuid", inst.GPUMdevUUID) + if err := devices.DestroyMdev(ctx, inst.GPUMdevUUID); err != nil { + // Log error but continue with cleanup + log.WarnContext(ctx, "failed to destroy mdev, continuing with cleanup", "instance_id", id, "uuid", inst.GPUMdevUUID, "error", err) + } + } + // 7. Delete all instance data log.DebugContext(ctx, "deleting instance data", "instance_id", id) if err := m.deleteInstanceData(id); err != nil { diff --git a/lib/instances/types.go b/lib/instances/types.go index 0d0c954d..150b898e 100644 --- a/lib/instances/types.go +++ b/lib/instances/types.go @@ -76,6 +76,10 @@ type StoredMetadata struct { // Attached devices (GPU passthrough) Devices []string // Device IDs attached to this instance + + // GPU configuration (vGPU mode) + GPUProfile string // vGPU profile name (e.g., "L40S-1Q") + GPUMdevUUID string // mdev device UUID } // Instance represents a virtual machine instance with derived runtime state @@ -94,6 +98,11 @@ func (i *Instance) GetHypervisorType() string { return string(i.HypervisorType) } +// GPUConfig contains GPU configuration for instance creation +type GPUConfig struct { + Profile string // vGPU profile name (e.g., "L40S-1Q") +} + // CreateInstanceRequest is the domain request for creating an instance type CreateInstanceRequest struct { Name string // Required @@ -110,6 +119,7 @@ type CreateInstanceRequest struct { Devices []string // Device IDs or names to attach (GPU passthrough) Volumes []VolumeAttachment // Volumes to attach at creation time Hypervisor hypervisor.Type // Optional: hypervisor type (defaults to config) + GPU *GPUConfig // Optional: vGPU configuration } // AttachVolumeRequest is the domain request for attaching a volume (used for API compatibility) diff --git a/lib/oapi/oapi.go b/lib/oapi/oapi.go index 450b9030..7e38e840 100644 --- a/lib/oapi/oapi.go +++ b/lib/oapi/oapi.go @@ -41,6 +41,12 @@ const ( Pci DeviceType = "pci" ) +// Defines values for GPUResourceStatusMode. +const ( + Passthrough GPUResourceStatusMode = "passthrough" + Vgpu GPUResourceStatusMode = "vgpu" +) + // Defines values for HealthStatus. 
const ( Ok HealthStatus = "ok" @@ -147,6 +153,9 @@ type CreateInstanceRequest struct { // Env Environment variables Env *map[string]string `json:"env,omitempty"` + // Gpu GPU configuration for the instance + Gpu *GPUConfig `json:"gpu,omitempty"` + // HotplugSize Additional memory for hotplug (human-readable format like "3GB", "1G") HotplugSize *string `json:"hotplug_size,omitempty"` @@ -274,6 +283,45 @@ type ErrorDetail struct { Message *string `json:"message,omitempty"` } +// GPUConfig GPU configuration for the instance +type GPUConfig struct { + // Profile vGPU profile name (e.g., "L40S-1Q"). Only used in vGPU mode. + Profile *string `json:"profile,omitempty"` +} + +// GPUProfile Available vGPU profile +type GPUProfile struct { + // Available Number of instances that can be created with this profile + Available int `json:"available"` + + // FramebufferMb Frame buffer size in MB + FramebufferMb int `json:"framebuffer_mb"` + + // Name Profile name (user-facing) + Name string `json:"name"` +} + +// GPUResourceStatus GPU resource status. Null if no GPUs available. +type GPUResourceStatus struct { + // Devices Physical GPUs (only in passthrough mode) + Devices *[]PassthroughDevice `json:"devices,omitempty"` + + // Mode GPU mode (vgpu for SR-IOV/mdev, passthrough for whole GPU) + Mode GPUResourceStatusMode `json:"mode"` + + // Profiles Available vGPU profiles (only in vGPU mode) + Profiles *[]GPUProfile `json:"profiles,omitempty"` + + // TotalSlots Total slots (VFs for vGPU, physical GPUs for passthrough) + TotalSlots int `json:"total_slots"` + + // UsedSlots Slots currently in use + UsedSlots int `json:"used_slots"` +} + +// GPUResourceStatusMode GPU mode (vgpu for SR-IOV/mdev, passthrough for whole GPU) +type GPUResourceStatusMode string + // Health defines model for Health. type Health struct { Status HealthStatus `json:"status"` @@ -387,6 +435,9 @@ type Instance struct { // Env Environment variables Env *map[string]string `json:"env,omitempty"` + // Gpu GPU information attached to the instance + Gpu *InstanceGPU `json:"gpu,omitempty"` + // HasSnapshot Whether a snapshot exists for this instance HasSnapshot *bool `json:"has_snapshot,omitempty"` @@ -461,6 +512,15 @@ type Instance struct { // InstanceHypervisor Hypervisor running this instance type InstanceHypervisor string +// InstanceGPU GPU information attached to the instance +type InstanceGPU struct { + // MdevUuid mdev device UUID + MdevUuid *string `json:"mdev_uuid,omitempty"` + + // Profile vGPU profile name + Profile *string `json:"profile,omitempty"` +} + // InstanceState Instance state: // - Created: VMM created but not started (Cloud Hypervisor native) // - Running: VM is actively running (Cloud Hypervisor native) @@ -471,6 +531,15 @@ type InstanceHypervisor string // - Unknown: Failed to determine state (see state_error for details) type InstanceState string +// PassthroughDevice Physical GPU available for passthrough +type PassthroughDevice struct { + // Available Whether this GPU is available (not attached to an instance) + Available bool `json:"available"` + + // Name GPU name + Name string `json:"name"` +} + // PathInfo defines model for PathInfo. type PathInfo struct { // Error Error message if stat failed (e.g., permission denied). Only set when exists is false due to an error rather than the path not existing. 
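Putting the generated types above together, the new GPU fields look like this on the wire. These are illustrative fragments only; other required CreateInstanceRequest fields are omitted, and the UUID is the example value used elsewhere in this change.

	POST /instances request body (fragment):
	{ "gpu": { "profile": "L40S-1Q" } }

	Instance object in responses (fragment):
	{ "gpu": { "profile": "L40S-1Q", "mdev_uuid": "aa618089-8b16-4d01-a136-25a0f3c73123" } }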
@@ -552,8 +621,11 @@ type Resources struct { Cpu ResourceStatus `json:"cpu"` Disk ResourceStatus `json:"disk"` DiskBreakdown *DiskBreakdown `json:"disk_breakdown,omitempty"` - Memory ResourceStatus `json:"memory"` - Network ResourceStatus `json:"network"` + + // Gpu GPU resource status. Null if no GPUs available. + Gpu *GPUResourceStatus `json:"gpu"` + Memory ResourceStatus `json:"memory"` + Network ResourceStatus `json:"network"` } // Volume defines model for Volume. @@ -8577,126 +8649,134 @@ func (sh *strictHandler) GetVolume(w http.ResponseWriter, r *http.Request, id st // Base64 encoded, gzipped, json marshaled Swagger object var swaggerSpec = []string{ - "H4sIAAAAAAAC/+x9+3LbuNX4q2D4a6dyK8my7GQddTq/cewk606ceOLE+7XrfApEQhI2JMAAoGxtxv/2", - "AfqIfZJvcADwJlCmk1iJm3Q6s4pJAOeGc8PB4ccg5EnKGWFKBqOPgQznJMHw80ApHM7PeZwl5BX5kBGp", - "9J9TwVMiFCXwUsIzpsYpVnP9r4jIUNBUUc6CUXCK1RxdzokgaAGzIDnnWRyhCUEwjkRBNyBXOEljEoyC", - "7YSp7QgrHHQDtUz1n6QSlM2C624gCI44i5dmmSnOYhWMpjiWpFtb9kRPjbBEekgPxuTzTTiPCWbBNcz4", - "IaOCRMHo1zIab/OX+eQ3Eiq9+MEC0xhPYnJEFjQkq2QIMyEIU+NI0AURq6Q4NM/jJZrwjEXIvIc6LItj", - "RKeIcUa2KsRgCxpRTQn9il46GCmREQ9lIoBpTCMPBw6PkXmMjo9QZ06uqosMf5rsB81TMpyQ1Ul/zhLM", - "epq4Giw3P7xbnvv5nm9mypMkG88Ez9LVmY9fnpy8QfAQsSyZEFGecX+Yz0eZIjMi9IRpSMc4igSR0o+/", - "e1iGbTAYDEZ4OBoM+gMflAvCIi4aSWoe+0m6M4jImilbkdTOv0LSF+fHR8cH6JCLlAsMY1dWqgl2mTxl", - "vMpiU+WKT/4PBcHKCn+jKvCj9hJ+4BjNYj7BcbxEGaMfsorc9NGx3gIKpYIvaESiLsLwAFGJcKZ4b0YY", - "EViRCE0FT5CaE1TiLeqQ/qzfRRca3Z5mbg8Pe4NBb3ARVLkT7/VmaRZ0gxQrRYQG8H9/xb3fD3r/HPQe", - "vS1+jvu9t3/5g4+RbQUO8SnAafHsOK50kQO2LIV1QNdL6BomN7PvOMGzW3Pv8BhRPQ4JMiWCMI2JgT/i", - "4Xsi+pRvx3QisFhusxllV6MYKyJVFZv1796IH8C2BjE206jfErXangNx68T8kogQS4JiogVEdlFEZ1TJ", - "LsJabWM5JxJpm/JXFGKmZVYqLBTiAhEWoUuq5gjDe1UKJMseTmmPGlCDbpDgq+eEzbTdfLi7Io9aGDv2", - "R+/tn92ftv6/VyRFFhOPML7imaJshuAxmnKB1JxKVMBAFUlg3B8EmQaj4P9tF87AtvUEth11s5jotRLK", - "js2wnRwSLARe+rnmgFvHPakwW6NXzAby4HfkLJtEVltKpDjC4LcAvs9O32zrLZliKdVc8Gw2L3PlV6cP", - "3pZosULdKpLdIKLy/Zjy8ST1wUTle3S8/RJpbYVimlBVaKedweDk8ba8CPQ/Hrh/bPXRkXFoAHyNPBdW", - "aco5FgRNsCQR4gwdnr5BOI55CMpfOw4hZ1M6ywSJ+jUzBLP7pIWwhYYbRxE1q5xWyO1xBsoIPmELKjhL", - "CFNogQXVm6diXD8GL14ePRk/eXEejDQnoyy0lur05avXwSjYHQwGJboW8jDnKo2z2VjS30nFzQt2nz0O", - "6oAc5PCjhCRcLIHjdg7UmVe395SLBCsU0/cEXej5DBN2ntUV7xCWWiHCfJkSsaCSe5y7n/Nnmn+ZJOW9", - "ZoS7ymJJhPb+HO+AmcA+liVaKsOYZ1GvtGQ3+EASENMCUM9Lq96WVt2ttPoN6hrHKWWkUV93vxUde8nF", - "+5jjqLfzhVUsI0rPvYriC/OgykwrACTnv/Y4Krtsgll0SSM1H0f8kmmQPbrEPkH5y7lCudKY4Pg///r3", - "+UnhUOw8m6RWu+wMH3ymdqnpEz21jzAFIlnqR+NN6kfi/OQ///q3w+TrIkGYls+oonRMtFVF5Zc5UXMi", - "SlbGMVj/yXh7MBw5eSktXwnfyoHoiiLkCyJivPQowp2BRxP+IqiC/WXHIW2hkB58gxrUszljtKoIB35N", - "6AHKA9Njvb+tXm4DSQ7IzvDE/hy21c2LMM1kBaRhHZwXEE1ql3xBhcpwrOWkYra8waVJW3jMvMmKlN0N", - "y/9cHrBCofZvtDZQFDyhVu6WmRlyGKvOh9/DMlq+2cO6IYXji3LzqC3MpOIJohFhik4pEahTC8hoNXSr", - "cmzB416EFQZ93NJoGHBXo99kaaYyTGkSzfFssjrlmZZAytCMzvBkqaoOy85glfV+Qrv5faRuygwZ8SDR", - "WHFPwsNJy/GRpqN7t03WB/JIY8XHiyn1zJxrqiICpRKFtTSUFVo9RS8NqU1LddHlnGrdJpEjAhi085Oy", - "I92/YD2kgRuho3yBfNp8Sm3S9aY37lmHixIQlIG3NFluIYzOT/rodQ7tnyRiWNEFcamyOZZoQghDGdhE", - "EsH6kAAsA5BJHfFQVR9ufXCTVduCeIHbZ32kHbgEM3RJ4xjyDQlWNIRkxYTW8LmcE2YZpVfSCoAVbt4F", - "K0uWTU/WVX43AM1AojFWnqiNzKhUotAcUuEkRZ1XTw93d3cf1ZX08EFvsNPbefB6ZzAa6P//M+gGRrlq", - "Tw4r0rPqZxOJQ99cB1V9YdM/ZY1y+Ob4aGgtQnUd9fsefrR/dYXVo4f0Uj76PZmI2W+7eCOpRb96Oiry", - "VqiTSSJ6TvVpqfJlq0pJoYZs1CcnmW6V1TR/WG9+DHav9Zt3kQet6VVIPsIr3U/IVNaVYGVfNevo15YM", - "VXz0X7V/UEh+KSCzOcOQlqYt6Kpj/seC4PfalffYV22e5djYHX/CINPO62SJyJX2a0mEBOdqKk2QVnVT", - 
"dvZ+2tvffbi3PxiU9jll6mEp314SYh7ScaitSisAdGQY46UOTvUY1AHvOkKTmE+qwvtg9+H+T4NHO8O2", - "cBjftB0dci/KjUIdS5G/uKMk96QC1HD408Pd3d3Bw4fDvVZQWQevFVDOGay4Dj/t/rS3sz/ca0UFn6//", - "RAiTU6gdKPHII6QHaRpTE9n0ZEpCOqUhInoGpAegTgJmieRudnVPTnA0FtYN9NoDhWnsIUMp1WIWs2+i", - "jrbpSRYrmsbEPAOGtPJ0AfMjmMmXZqOMETEmjjy3mCkhUnrzHrV0hMMlfwVclIhMstlMk6RMuhMqwbMo", - "HCJK4mhkduiNeg64WQD2tkkOLA4tpeE5vySiF5MFictCYMyRBjbhgqBcTgzTKlhRtsAxjcaUpZlXJBpJ", - "+TQT4F+aSRGe8EyBL2kYVl4Ezh0gRphqde0l1go5fiY4NmfKVUpIhZUJ9pxu5u+reTH+/kZ22El8bDh2", - "GbMaAxKPCTw8OTIGPuRMYcqIQAlR2J5gl7LMcNgRdIOelqkIk4QzxKfTv67POzeEAPkGWedEHpZDz7tz", - "IOnMRpR1F1byeEEilGBGp0QqZN8sryznePjg4QhPwp3hbkSmew8e9vt9f3ZGiWXKKfMs9SR/1o4V2ya3", - "2Svm7Mv55/HhDvLpbXD5GJwevP45GAXbmRTbMQ9xvC0nlI1K/87/WTyAH+afE8q8efhc59YgBRVjNYIO", - "V8020lHXFNO4VsyRZnFs/z7SmDAS5gLJQdncGOL6/e8XWjRj+juJkPd0UuGZdsSNxH3eMWQ3+JCRjIxT", - "LqlZfcULtk90KDvJaBwhGFEu7FDmT9XEyLAR/ZJzAumGdZ6JtGkNeMeumTFFYxNx+/01j5/SApRc7dbS", - "fICzfVr4yylhkbGgWgzMr5Czhd4V8A+AT+sZIzgVBe6erTDjkov3lM3GEfVI5y/mIYqoIKGC46Cb91Cw", - "jdP0ZlH0Z4NynZajf0P4YU9SPdblq2vyT4nbq6u/nP39w//I059+2/nw/Pz8H4tnfz96Qf9xHp++/Kzj", - "ovWH6F/1JHxtahaC1coJeFvxOMEq9Dg+cy5VA9XsE6Q4SvTgPjrEDE3I6IL10HOqiMDxCF0EOKV9S8x+", - "yJOLAHXIFQ6VGYU4Q3oqNCc4ImJLDz41R2Z68EcXk13X54iWDCc0RMISOT+Kkdkk4gmmbOuCXTA7F3KI", - "SMj96V8RCnGqMkE0R1CYiXiJJgKHJC/sKRbvoo84Ta+3LpiaY4XIlRIagxQLlVfcuBWA0RYqk1u0r5MI", - "LXCcEYlCINQFy+1HpEHQkygsZkT180gU/P1afq+BKN6EDheqckSxP+h6+Ij0e5qRMZWKMJQfJVIJwos6", - "7oBpf1DZ/vuD/ZvT2LkMrRE/kO7VMk8nlC32hxFgWNoo4/FcqfTmuk3QN2aPoJ9fvz7VZND/PUNuooIW", - "OYs7nMVLhHVcTKRJzqoYfBJ7prcV+BKwhrstEXptXtbDYnkzHk9gYfT6+RlSRCSUGf3dCTU5pzp8JyZN", - "SKXMtChSjA4OT55s9VvUqQJtc/jX8PF1jmEtG+VOpFczYDCiyL1o+nbR8VFXu1N2hxaOFqTfn3KBYqNg", - "in09Qm8kqR6GAatMptBwMl4WVTFGq18EW27GtK4pRuhV7t/hHJS8ErAQBjdlsS9h2gv2ixYMczawMnu3", - "Ciucetj4xao2OAnACtncCZjiZlWwfvt7KA57nrN6wcDt9na50kAv5heNgvd37oHs3jaWvG1VVfVAuVRA", - "kBdWfd2KqNX6JizHkuFUzrlqPrHDyL2DyBWVSq5WE7U6Y1qtpqoaG1MnteaI/kvWRYmMMTguq6PxxSue", - "vuYB1LdXbbW2Pupzi5ysu3VHNU6N29tXH1Td6ebPX7Za6U7AqdQd+ZRB2Sq56oBPLjXqBtRzMnogJZ0x", - "EqHj06KevkhfuOlrOD0a9nce7vd3BoP+zqBNMifB4Zq1Tw4O2y8+GJrwdoQnozAakelnJJOsYBv3AceX", - "eCnRhXPwLgLjUZZcydK2tU5gq3T2akXXpxVw1U3aTSVatynJaqXvoRKzwVE4gyrN23sJDxq9hBu5KhVW", - "5GY33myiM3jZjRrfJs1JUMizOGJ/Umiid55x7Elk4w9JlJEU8y6V6A17z/glq6Jusl16/37IiFii85OT", - "Sm5UkGkm25X/SMXTtJEPPL0VG4Y3OGs3QlOqwNtE1V1dE5Ys0BevsSsnctxhn5G6Fgmdstw1F3/BdJCs", - "MfV60UhLBrKzo0mmUF4ErUXuUPtBqORdmVIniJ9eGUdLzwA2I9RP4mXugK0dfIq1+LmxKfxr/Yizeaa0", - "cYcxcp4ppP8FIGsUrAO7fgojySP0gsMYC2lXq/+aJ2xexyyaLFdfr3vNHZPb0cGc4oJEsJjdliP0NN+K", - "+Wa2m7cjif1pNIQ9FIYD7y0T9Fmn1XIr6AaW6kE3MCQMuoGjjP5pMIRfAHzQDSwg3nqSU6zmx2zKV8O0", - "26gse/jiguJUIymlVjMRYZREW330sqK7LN3gOCeWBEUZseVthg4C24pCbELVFKs5CCYMpGxWLbZeWbCN", - "IjEwrC9nhHXti218Huk/MHgtMqCVCUkkwsXRQav4isrxlMakzcSCzLIYCwTvtwNZLpOYsvdtZpfLZMJj", - "GiI9oG6QpjyO+eVYP5J/A1y2WmGnB4yLLFnNwBjgbI7UMKS2boHC3zSWW7VTl1Bbg20zfhuuXbdxIb1V", - "EE9pTFAC1S9vGL0qCXq1NmhvOGg6ZGuYtHK8Vi37aVVeVLMkVmR9luIVkTwTITnILy54UjRptgrnQhtU", - "d9+hegK758MWsizrjhTzqUrnis4rdbVXVbqWaqBalVw5Y+2tKsxtYsMxk6kW8Abfblq/o39cTkXWo+JF", - "4q990a5yE7VOjCO9Sq9K5u7B/qNHu3sPHg1bkcZGO3m43JAMawqZHQTbkoS1O0JVjg0fDOB/twLKBMx+", - "kBqC5ipAlfs+nwzQ9Zrtc5afUtdq8PP9saYxQ8FJYaersHJvvxW1sOsU4Ql23SPwKErXODtkOiXgqI0N", - "3XoFMLVDnlYwhDjFIVVLzzkovoS8N8pfKc3+sF3JZA1YD0nt3AhPlfb+F0TIbFIU2XXc4ujPCDJJNVnY", - "b11PKrPJGGbwJN3qq8J79qAoqoUgRdDDs0lcSk/bSnFtJkAifHnUy5yY6BLLSmyof4eKRN3SNd16EsG8", - "sa5ce7W2SoOCbNl0KT3qKyWv2SA7qMz+Gju7QdmaFOJcp/g6M9a8BbVVhlOoNmGaxyp66kStXWwzkdUP", - 
"1g5+2qjxpFzpvbaUvlIWnhuU2y9bSsveZmC9BhXEw8JgKVDM3a1wyMdcEzQ3XXBKXAOhWokqlUrH//bO", - "Dyq9jDokSdXSFUm5mH7rdkH8QT6hVza+8LHW4NGXKKx5s7aS5r/kylw5b+IWuTFjssLTxuNrv/d4VD+T", - "MGGSvTJQzaHXCqGl6jU7l+u6XJl2UxAD2dKRWVavdb1FZ6umqLfYOeaEumhtdVMw13BQbO7TlDArQdLM", - "G5M0+8w2YFS6/l+fSDIbkdxci2FSNDom7NXvlIAXdikohDiWQIawmgR51LoaGq/P5Z/gq3wFCCCxRLWr", - "zwaPUluQZ4/hivsrd7eATt0UAEb9Evvjz+uP5qRqlRnrGqa5tKx341n9s0ajNe2tmnAWa3TX92TTqouE", - "maBqeaYNgj1xJFgQcZAZMQRLAUjAn4vFoR7p+hqixqnHeXxGGBE0RAenxyAlCWZ4pll2foJiOiXhMoyJ", - "LSdZSW1Ce4qXh8c9UwfnTlzh/I8qIIi7b3pwegxX3YQ06w76wz60RuEpYTilwSjY7e/AZT5NBkBxu9QO", - "xyZn9EYEU3YcWZN7ZN/RxJUpZ9K8PxwMzG0SpqxyxcWFou3fpEk7GAPb2kuzF59X8+grhRLOGRBww5Vo", - "SXfIXHeDvcHOrYC78UKQD4Q3DGdqzgX9nUR60Qe3pMgnLXrMTMzr+r4Q+2IhwsHo16rw/vr2+m03kFmS", - "YO0xGtL56ZZy6ZGCcse2wOwxItVjHi2/GL6+pnDX1Q2ttdf1ihB+OT472Vulub2lW5DMiNgGuP0YR3mh", - "WMdesMoL4CpXgb+W0O8N9u5+0dIN8vzqF+Km7M4A8ejugTjkbBrTUKGeg8U2CkM4Nk0JqgJyX9TBKws1", - "wg6vKdRNFl3O9HTOVGxXklKNRqPW5HQz1qPeWfUWZiTHqnRn+4cluUl0jqgMtXNZlpZeiNNSL1dZ7NOy", - "FH2k0bXxlWJizrOrMnQEf89NTooFTogiQgJMDa37UNHnlOoH7iwIwlwTRFbNSbdEw7ov+XZFYvca+zdk", - "rG4bNqAUj2oK8SsqwlpBWamJyX2S5jc5F13ThuuuX8M9I+rbEs3B5rwgd7H/a4r5fZGoZ0S5LZKTTWvB", - "eX4hvUm87JX1O2S0XcGD+JmOPs2uNoCaQqYCLTMUhXMSvjcI2d4i6zyCY9d+5O79AHPv/hbW34L/w9y3", - "CBwLWq0LFo9tddvdxYqVDtStQsXhF4PACpiHyFCaP3EXmU2JHZZLFm59jZjxvzsqrPcCuUc76TSLY+hm", - "Zy+yF90Hyvp0+6P2D1r4yW63rfVF3rx63iMs5BGJ7BWSZofEXTb+st6yYZhB5YeYtImvgFROMJqd0c/g", - "vzk5KBrw/3H41N4c+uPwqbk79Mfdg6IP/90Iy2BTqnnT3us9Fj7tvNIq0UA1mSvBN3l7+Vsbcfhs74Xb", - "uHw5gD+8vjZeX5lcax2/vA3GHbp+1W90bPicIBc2H7Xhkbtt8Z25fJtNPVmJNEekUJlRycXbCwjwFQd7", - "o990CL5PW89WHNBc4sr6t2UOtdiQa70DJ7rHR13brMG0WEgFmdKrzWVUHRwb9xLtuptPpx4kEzrLeCbL", - "t8ihNweRRT/MigK+b/5rYZ4bPdhvWEoHmzQdG3dQf8j9HbnOdYYa5W2ORW5ynt1bm3Gei6Oa9t6zg/CH", - "99zKey6Ra733nF8+vkv3ufqRtI37z07efAS3dZXfowd9z7xSzGyOu3TYW9FxrR3UouXLettffN9k4wf9", - "+eKb90vdXcP7mEOC7hHwITbnCRa2ptkV/NbkYbBZ3bd5F/A+i9izcptBv7MFimg75rOy21VvcyIITop2", - "Yki/jbBEZwBY74wwhZ4sNFb9C+Z6Er4zl6TeoVxQzRcZYxIq+92jmMN3fSTMD3053uE0fZf33twaoWdQ", - "3lmirlm8I4mgOEYhZ5LHpr/Fu0WSvBut1omfn5zAIHhnbirC343ybxHle0zqty7YBXtFVCaYBCxiLBV6", - "gWLKiEQdzXDB49h8p+KdpmcJvy1ocKhnNM3t4uUF0yMoy4i0WFI2Q4xc2gnpFL0zjQPg/sE70+uwcdc/", - "11z6Sju/29xzxuCiOBJAONMvksAHCGBd6MFTLGw/jlAsld+i2Bl4rzt9XE10AU29JDV3YSmjSssHz5T5", - "4IIPEEN5PyiNV3xWvw0xQ+6iaEWUcZq2FV8LJkjxIknWyDDqFF35kFQRz9RfpIqIMG2ErXQ3CTfq4ND8", - "Q+H3pultpU+g6briI5W9leslVWBae7tmLeZfiyQJTNPCBPuar7SwJIpcqW2i1UrPkLWqU+sTrsZjmjMw", - "EHXOzp5s/bAZLd0SIFlV2VsCeiyH7foDN9W8wdsr88J377m49khfWQw3fxRRgoJC4zYWTZb2w4B5l9Z7", - "dScAGFlgBvbO4uXdI+5Z4x6x7aq++z1SyMd3vktCLqAPu3SdFO9P8VYp4iht9w40uSuax3Vd1Ht+crLV", - "tGlMK+/GLSN+hMO2jvK7tynQ9+/+7RbTyBXnCKxLFuoNoRpjdBezUmbaZehQw3wrDq+2TID+eHIpFUlM", - "wD7NYrjYBlXr8F3fqRtnagW68KVfLf5dSFmVer9dsAmZanuYEqHX1sP1/KXYwxfWnimcb99Tswe/jbgW", - "uihAKIdVE9VWPm3kGij4Yqe858Mng/QUAtVq/0GJOvBJdwBzIVGsf2ytjXRNc8Lbxbt3qeHy9pu+W61G", - "ZnNh/h403HFNrblWsvdOrT0j5c3i9A8w2qfWeLrOzPP0h5W3fW1/+MT30ieGg54cm85M4BAsrrSdi/3+", - "r20Buv3R/Di+6bhQ4XB+7tpPfRum1HaruWkZh+C92JQWp4iYK72b35M8byh0T69taMI5FCB1Uj749FsB", - "06jse5PuL1/jUqbjrSpcNrq33HX5b2ZvbdryWRhcuXaZHvdlmxtJc5goXgttRbmB6dqA1jW0hG66blje", - "CbZbbu9rPg+bB6hFI7q8k2j/guWtUxFlYZxFBB2evunaz7V04YMwZgbbsLOP/B1uJcKCuDa3F0xxFOI4", - "zGKsCMpbvZr2zLLhWPdVqf3xne23YhEPo/MetzLvgXqfYgy/TAD3yk1WQeJK3zpprC21nz3ZSGWpNWa3", - "qCt1GPwowWtRVVoilnMpfA0rJcJQO2Be76OzLE25UBKpSw7fOpBwlv/3s5cv0IRHyxHKxzFk2upaFWf7", - "ocqUhHRKSQQ9KfXYE6jWxgI+qpqUJnAjU0F6KU9BdUTmyo6lsXGPMFJY9Ge/IyzCOV0QjzIxc+b+0d2V", - 
"x9Zdh26QOPS2NXrQBbc6ae3jCgUsVX5UcTQZRPtNcfM9w7ypqZui1Bl4QhkWy7ZtgV/a+hwUZlLxxM17", - "fIQ6uPoFQ9syORV8QaN6S/FvpH/wCb6iSZbkH9F49hg+SSBMqQd8nAYKjZxMkauQkEhC5cfWLXsNr7YZ", - "trzw9A3daN2006aNPuVXrJkumhNqFmsf0wm54hzFWMzI1ndzM9HuteJi4vFR7VriPaz2XjjpK/yMlvXd", - "7ULalpHmXdR25+mOzVZ2n387UVipf9s9vF64yN3MppLyb0sEB5szCZsuJT+/x1k7HW0tamQzE+gZfQLz", - "nIc4RhFZkJin0E3fvBt0g0zEtjf4aHtbh2mxDuTg6+vB9dvr/wsAAP//kZMzgCqrAAA=", + "H4sIAAAAAAAC/+x963LbutXoq2B42qnc6m7HcdTpnHHsxFudOPGxY+/TbudTIBKSsE0CDADKVjL+2wfo", + "I/ZJvsGNN4ESlcRK3KTTma2YBLDWwrpjYfGT59MopgQRwb3BJ4/7MxRB9fNQCOjPrmiYROgcfUgQF/LP", + "MaMxYgIj9VJEEyJGMRQz+a8AcZ/hWGBKvIF3BsUM3M4QQ2CuZgF8RpMwAGME1DgUeE0P3cEoDpE38DoR", + "EZ0ACug1PbGI5Z+4YJhMvfumxxAMKAkXepkJTELhDSYw5KhZWvZUTg0gB3JIS41J5xtTGiJIvHs144cE", + "MxR4g9/yaLxLX6bj35Ev5OKHc4hDOA7RMZpjHy2TwU8YQ0SMAobniC2T4kg/DxdgTBMSAP0eaJAkDAGe", + "AEIJ2ikQg8xxgCUl5CtyaW8gWIIclAkUTCMcOHbgaAj0YzA8Bo0Zuisu0n86PvCqpyQwQsuT/pJEkLQk", + "cSVYdn71bn7uV3uumTGNomQ0ZTSJl2cevjk9vQTqISBJNEYsP+NBP50PE4GmiMkJYx+PYBAwxLkbf/sw", + "D1u32+0OYH/Q7ba7LijniASUVZJUP3aTtNcN0Iopa5HUzL9E0tdXw+PhITiiLKYMqrFLK5UYO0+ePF55", + "tinuiov/jxiCwjB/pSpwo/ZG/YAhmIZ0DMNwARKCPyQFvmmDoRQBAWJG5zhAQRNA9QBgDmAiaGuKCGJQ", + "oABMGI2AmCGQ21vQQO1puwmuJbotubkt2G91u63utVfcnXCvNY0Tr+nFUAjEJID/8xtsfTxs/bPbevYu", + "+zlqt9795Q+ujazLcIBOFJwGz4bdlSawwOa5sAzoag5dscnV2zeM4HTj3TsaAizHAYYmiCEiMdHwB9S/", + "QayNaSfEYwbZokOmmNwNQigQF0VsVr+7Fj8F2wrEyFSiviFqJZlT7NYI6S1iPuQIhEgyCG+CAE+x4E0A", + "pdqGfIY4kDblr8CHRPIsF5AJQBlAJAC3WMwAVO8VKRAtWjDGLaxB9ZpeBO9eITKVdnN/d4kfJTM2zI/W", + "uz/bP+38XydLsiREDmY8p4nAZArUYzChDIgZ5iCDAQsUqXF/YGjiDbz/08mcgY7xBDqWukmI5FoRJkM9", + "rJdCAhmDC/euWeBW7R4XkKzQK1qAHPgdW8vGgdGWHAgKoPJbFL4nZ5cdKZIx5FzMGE2ms/yu/Gb1wbsc", + "LZaoW0Sy6QWY34wwHY1jF0yY34Bh5w2Q2gqEOMIi0069bvf0eYdfe/IfT+w/dtrgWDs0CnyJPGVGafIZ", + "ZAiMIUcBoAQcnV0CGIbUV8pfOg4+JRM8TRgK2iUzpGZ3cQsicwk3DAKsVzkrkNvhDOQRfEHmmFESISLA", + "HDIshadgXD95r98cvxi9eH3lDeROBolvLNXZm/O33sDb7Xa7Obpm/CB3Yg0znpxdHimM5fszKuIwmY44", + "/ogKbqG3e/LcKwN+mOILIhRRtlAcYuYAjVlRHUwoi6AAIb5B4FrOpzetd1JW1H211BLRZosYsTnm1OEM", + "/pI+k/udcJSXTS0MRZbgiElv0e612ny13SSJJBf7IU2CVm7JpvcBRYqtM0AdLy17Z1LV17ICa9Q7DGNM", + "UKV+b34vOvmWspuQwqDV+8oqmSAh515G8bV+UNxMwwAo3X/poRSkcgxJcIsDMRsF9JZIkB26xzwB6cup", + "ArqTmMDwP//699Vp5oD0Tsax0Ua9/pMv1EYl/SOndhEmQySJ3Whcxm4krk7/869/W0y+LRKISP4MCkpH", + "R2dFVH6dITFDLGeV7AbLP2nvUA0Hll9yyxfCvXzguqQ46RyxEC4cirDXdWjCXxkWSr7MOCAtGpCD16hB", + "OZs1XsuKsOvWhA6gHDA9l/Jt9HIdSFJAev1T87NfVzfP/TjhBZD6ZXBeq+hTuvBzzEQCQ8knBTPnDEZ1", + "msPhFugsSt49Mfuf8gMUwJf+kNQGAivPqZZ7pmdWOY9lZ8XtkWktX+2RrUn5uKLiNMrzEy5oBHCAiMAT", + "jBholAI4XAz1ijs2p2ErgAIqfVzTaGhwl6PlaKGn0ptSxZqj6Xh5ygvJgZiAKZ7C8UIUHZxed3nr3YS2", + "87tIXZVJ0uyBgpGgjgSJ5ZbhsaSjfbdOlkjlnUaCjuYT7Jg51VRZxIo58EtpK8O0copW7GOTxmqC2xmW", + "uo0DSwRl0K5O8453+5q0gARuAI7TBdJp0ymlSZdCr92zBmU5IDBR3tJ4sQMguDptg7cptH/igECB58im", + "1maQgzFCBCTKJqJAra8ShnkAEi4jJCzKw43PrrNwOyq+oOZZG0gHLoIE3OIwVPmJCArsq+TGGJfwuZ0h", + "YjZKriQVAMncvGuS5yyTziyr/KanNAMKRlA4ojw0xVywTHNwAaMYNM5fHu3u7j4rK+n+k1a31+o9edvr", + "Drry///0mp5WrtKTgwK1jPrZRqLRNddhUV+YdFFeoxxdDo/7xiIU1xEf9+Czg7s7KJ7t41v+7GM0ZtPf", + "d+FWUpFu9XSc5blAI+GItazqk1zlym7lkkgV2avPTkptlAXVf1htfjR2b+WbD5E3LelVlaxUrzQ/I7NZ", + "VoIFuarW0W8NGYr4yL9K/yDj/FxAZnKMPs5Nm9H1GPOb5wzBG+nKO+yrNM98pO2OO8GQSOd1vADoTvq1", + "KACMUjHhOkgruim9vad7B7v7ewfdbk7OMRH7ufx8jompj0e+tCq1AJCRYQgXMjiVY0BDedcBGId0XGTe", + "J7v7B0+7z3r9unBo37QeHVIvyo4CDUORv9ijJ/ukAFS//3R/d3e3u7/f36sFlXHwagFlncGC6/B09+le", + "76C/V4sKLl//BWM6p1A6gKKBg0kP4zjEOrJp8Rj5eIJ9gOQMQA4AjUiZJZS62UWZHMNgxIwb6LQHAuLQ", + 
"QYZcqkUvZt4EDWnToyQUOA6RfqY2pJanqzA/VjO50nKYEMRGyJJng5kixLkz71FKR1hc0leUixKgcTKd", + "SpLkSXeKufIsMocIozAYaAldq+fUbmaAvaviA4NDTW54RW8Ra4VojsI8E2hzJIGNKEMg5RO9aQWsMJnD", + "EAcjTOLEyRKVpHyZMOVf6kkBHNNEKF9Sb1h+EXVOoWKEiVTXTmItkSNLDC4tfXJ2uWm2JWZ0gkMHGnM5", + "mXlqTLrNQ7za6160ev9PJR/ekHCh9QAmQI2JaIDapfNR9X5t9M6qYEoPp0EeuiWcoH3NkZNKo11LEQ7E", + "TEakkIAxAsZM6kyaylNmi2QK/plLYU4YjNA4mUwQG0WOSOulfA70CzrwxwScPi8qTamc67pbZ4XNUf7W", + "BPqYTHdqU98RyZXQaOao+c69XeeI04T56EJAkXA3VzLzDuDqpTZ4nZYDgJOzSw7SVdqOEK/mKcnZbMFl", + "cKJnbFCiA6lcZKaYs7YaPssGmhjWoYwjpwKyggAa82mcKDG8OG8N31x1ogDNmwWY5MPbGQ2RhHsn51vN", + "7QFudqRTyHPPq1xkzRi8rgDlaJVKcG0i5eTVQR1BBQxHPKTCAc1b+RCoh6Bx9VIf3EkImiAubKX8e44K", + "Bf7ed0qM1EhVy16oBcuxdkHA16Y9Im228ugVFnWJyi8Ihrp4qMjPPJUbu/H0prjR9Gat9JpJXOsO7VFH", + "yXJGjtjl6PRYR2Y+JQJighiIkICmVCl3nKhOtb2m15LOQABRRAmgk8lfVx8wVuRuUnZZFf0f5XOGDxf5", + "46lJBZZzD5yGcxSACBI8QVwA82Z+ZT6D/Sf7Azj2e/3dAE32nuy32213Wl2wRUwxcSz1In1Wbys6+lCq", + "lc3Z5rMv24cHODitg8sn7+zw7S/ewOsknHVC6sOww8eYDHL/Tv+ZPVA/9D/HmDgPXFNnuQSp8g2NKyfN", + "kRYjgDmYQByWqvZiabP03wcSE4L8lCGp8hLX5ibdlvy1ZM0Qf0QBcJahCDgF0v9WHPdl9SZN70OCEjSK", + "Kcd69SUrap5IvThOcBgANSJfwSf0n4oZ7X4l+jm9rPLEq0JK6xipd8yaCRE41KlSd6DtCDBrgFLhrjxX", + "OJunmTGOEQl06CPZQP/yKZlLqVD/UPBJPaMZp6DA7bOlzbil7AaT6SjADu78VT8EAWbIF+ocf70MeR0Y", + "x+tZ0e38pTotRX9N3siUzDisyzfX5J+TcC2u/mb69w//n589/b334dXV1T/mJ38/fo3/cRWevfmic/7V", + "1VLftORp5ZmayjIWSp3qsscpFL7D8ZlRLiqoZp4AQUEkB7fBkQrQBtekBV5hgRgMB+DagzFuG2K2fRpd", + "e6CB7qAv9ChACZBTgRmCAWI7cvCZrnWQgz/ZGPC+PEewIDDCPmCGyOkZOk/GAY0gJjvX5JqYuYBFhKtD", + "G/krAD6MRcKQ3BHpa4YLMGZQhpsmjM4Wb4JPMI7vd66JikTRnWASgxgykZZW2hXURhuo9KGQeR0FYA7D", + "BHETyV6T1H6o0FxOIiCbItFOU4gqUVM6mKkgijPMoEwUzpYPuk3HPgL5ntzIEHOBCEizEpgr5gUNWxlw", + "0C2I/0H3YP35Y8pDK9hPcfdyPb9lyhryoRlYLa2V8WgmRLy+QF/pGy0j4Je3b88kGeR/L4CdKKNFusU6", + "GINxHGLE9amaCJVPYooxdjzXyZne3ZoIvdUvy2EhX4/HC7UwePvqAgjEIky0/m74kpwT7Ev81PkO5jyR", + "rIghODw6fbHTrnEhQdE2hX/FPr5NMSwdI9jk1nKEqUZkSXNJ3yYYHjelO2UkNHO01LnpS8pAqBVMJtcD", + "cMlRsYpBbZU+4tE7GS6yDJnW6tfejp0xLmuKAThP/TuYgpKWfGfMYKfM5FJNe01+lYyhD3WXZm8WYVXH", + "1SZ+MapNHeFCAUzSW5nialWwWvwdFFcyT0k597iZbOeTlnIxN2tke//gHsjuprHkpuWzxUqgXOVXWkH7", + "bUtfP6eQ1e7QydmlKheFfMQJjPmMiuriDAjsOwDdYS74cuForXKC5cLZonnSJbErqrG+ZgksSwhRlRFl", + "NL56ceu3rDX4/gprV5bCfmk9q3HQHqictVIhuEpBi7pB//nrFqY+CDiFElOXMsjbMVsI9tlVpU0PO4pg", + "DjnHU4ICMDzLrlplCQ87fQmnZ/12b/+g3et2271unfRPBP0Va58eHtVfvNvXAfEAjgd+MECTL0g/GcbW", + "DgcMb+GCg2vrEl572gfNOZ85sTVuY62jveXi3c+r1S0bwXXVuJtU39bS96rovsK1uFAF+Zv7FU8q/Yq1", + "u8oFFKiuGb5QL9tRo00Sowj4NAkD8icBxlLydCiAAhOxcCQ0p+h3MQeX5IbQW1JEXefHpPx+SBBbgKvT", + "00I2laFJwutVenJB47hyH2i80Tb017h3a6HJFVtvo8C6rAlzFuirl1PnUz+2rkNzXY0UUOb+OY9JMdHk", + "lnu/AqdS8B6g+ShJXI6OfGQrNC8vh8eFDYdwv3fQPXjWOhj39lt7QbfXgr3d/Vb/CexOdv2nu73+7ooj", + "1RplEp9f+VCU0OqKaEV4lQjTRezBQMpQWrowTgRIbwZJ4TySHiPI+aG6/lfFpufaJZUzKOvqyyfhInVV", + "Vw4+g1JQ7dhY/Wv1iItZIqQbpMbwWSKA/JcCWaJgXP3VU2iZH4DXVI0xkDaloSzFDPp1SILxYvn1cnzR", + "MBUgDHFBGQrUYkaBDcDLVGmlas+ouQZH5qfWpaZSSlWB7eiA2rj3Zre8pmeo7jU9TUKv6VnKyJ8aQ/VL", + "Ae81PQOIs8hyuUxhZWVEVmpRPlffpJAmq6LHXM2KczUcoCEZMC/MuUrwnTo+mttRketUtQqQola3xmV1", + "ScsZFLMhmdDllMImxtIcFNoETiyZhnOp5AJEMAps7VRqNQ0fqqPHkCMQJMhQTvMVg4bgUKdVYihmStDV", + "QEymxaKrpQXrmDANw+o7E2pd82Idb5u7D7feskTRSgfDHMDsmKtWZI/5yK2RlydmaJqEkIFyHdcKkPki", + "CjG5qTM7X0RjGmIfyAFlV2hCw5DejuQj/jeFy04t7OSAUZbRLbk2GjiTz9cbUlo3Q+FvEsud0gmhL/2Q", + "jh7fUb1g6gQvzkqnl9Lw6VKnS4LvcoxeLEDe63erDoQrJi0cBS+XydWpKc7LvmFZl8TbCrbD9HakI52o", + "M10l63+katf0sGK1gLMsSWUEVx1/p1PlzsBtPGQLvIt0zRVa16rrtmrYeXUh9TEqjkR1ZYsz7WOndWvu", + 
"YT5tXs7HzCN3ga0M0qqodapDuGV6FbLMTw6ePdvde/KsX4s0Js5OEzUViduqZI2FoMORX7qIXNyx/pOu", + "+t9GQOlUjRukinRNEaDCpeLPBuh+hfhkBaAlNyKVjxXdorKdtLWiha3cO6hFrRUey2HB7cn1lmigyQQp", + "x3ek6dbKgCkdSNaCwYcx9LFYOM7s4a06owHpK6VCxhqzl4B1kNTMDeBEyLhzjhhPxlklf8MuDv4MVA6z", + "xAsHtS+t8GQ8UjM40r3lVdV75lAzKAW/WbhNk3GYO0ox19GkmVAc4crg36bEBLeQF7IS8rcvUNDM9Q4p", + "p6/0G6vuhC3XAapCZnM3K5eYdxXjlmyQGZTf/tJ2Nr28NcnYuUzxVWasWgSlVVYnpnUSBA6r6Kjw9def", + "AJX0g7GDnzdqNM5fJ1t5X69w96x205XlZbUh2hzc3EHCJgPLF2QUWxkYDOWyuZuFnXUxhU7zVN2+jmw3", + "xNL9GcwFoBNbJA1yL4MGimKxsIWANsjb2SztdJhO6OSpr3x02332NYrHLldWi/2X3OfPZ/rsImtzfEt7", + "Wlmi4fY6j8unaDq8MvcZi6c+pVtaXLSqndJVLTt170wVO5nyqGlSrufeoE1nVbScSY6uwsj6dK4LAiuK", + "IfRl3xxmOUiq90aneb+wpynmtpnpZ5LMRDLr6410qkzGkq3yhVd9XYZhFRoZAmnCShKk0e5ySL369OkU", + "3qUrqMATclDqy6LxyPU4O3mursCd24uPeGKnUGCUO+w8/7Jmr5arljdjVfdXe5DgFDyjf1ZotCrZKjFn", + "tkZzdYNZqbqQnzAsFhfSIJgzcgQZYoeJZkNlKRQS6s/Z4qrm7v5eRZsTh9N5gghi2AeHZ0PFJREkcCq3", + "7OoUhHiC/IUfIlMytZRiVjf+3hwNW7rW09YIqBNrLBRBbDOMw7OhuofPuF632+63VZ83GiMCY+wNvN12", + "T3UakGRQKHZyt9ZMUkcKojJlw8CY3GPzjiQujynh+v1+t6uvuhJhlCvMbjt3fuc6XaENbG3vrupG2/1S", + "MZB1Bphqv4Ekp1tk7pveXre3EXBrbyu7QLgkMBEzyvBHFMhFn2xIkc9adEh0rGyb0iHzYsbC3uC3IvP+", + "9u7+XdPjSRRB6TFq0rnpFlPu4IJ8+1lPyxji4jkNFl8NX1eH2/uiQEvtdb/EhF9vny3vLdPctBDJSKZZ", + "bAu7/RwGaTFkw9z+Tos8C31KvhXT73X3Hn7RXHub9F46oLq0VAPx7OGBOKJkEmJfgJaFxXQ9BTDUHZOK", + "DPJY1MG5gRpAi1f5BE5OZ01Fp5DMqjQapY7t27Ee5TbxG5iR7JQw47WflmQd6xxj7kvnMs8tLR/Gucb0", + "PJPTPBd9wsG99pVCpOsKijx0rP6empwYMhghgRhXMFX0IQZZ03YsH9gzJBXm6iCyaE6aORqWfcl3Sxy7", + "V9lcKiFl27AFpXhcUojfUBGWSiBz5+qPiZsv0120HaXum24Nd4LE98Wa3e15Qbbr0Ldk88fCUSdIWBFJ", + "ySa14CxtulDFXqYtwwNutFnBgfiFjD61VGtAdeldhpYeCvwZ8m80Qqbx2SqPYGh7oz28H6B7S2xg/Q34", + "P819jcAxo9WqYHFo6jEfLlYsfE6jVqjY/2oQGAZzEFldJhnby/q61BHyBfF3vkXM+N8dFZYblT0iSTpL", + "wlC12jXNGrIOG3l92vkk/YMafrKVtpW+yOX5qxYiPg1QYC49VTsk9kL91/WW9YZpVH6ySZ34SpHKMka1", + "M/oF+69PDrKvCf2x/9Lcdftj/6W+7fbH3cPso0IPwyzdbanmbXuvj5j5pPOKi0RTqklfe1/n7aVvbcXh", + "M/1FNnH5UgB/en11vL48uVY6fmmrlwd0/YofHNvyOUHKbC5qq0f21ssP5vJtN/VkODLXFLWQizcXF9Qn", + "pkzXCtNS8RGJnqk4wCnH5fVvzRxqJpArvQPLusPjpmlIotuIxAxN8N32MqoWjq17iWbd7adTD6MxniY0", + "4fm+B6r/DOJZs+6CAn5s/mtmnis92O+YS7vbNB1bd1B/8v0Duc7lDdXK2/T3XuM827e24zxnRzX1vWcL", + "4U/vuZb3nCPXau85vVr+kO5z8YuvW/efLb+5CG7qKn9ED/qReaWQmBx37rC3oONqO6hZO4XVtj/7+NrW", + "D/rTxbfvl9o7io8xh6T6naivxFpPMLM11a7g98YP3e3qvu27gI+ZxU7yrTTdzpZSRJ2QTvNuV7kxD0Mw", + "yhrgAfk2gBxcKMBaF4gI8GIusWpfE9t3872+JPUepIyqPxcdIl+YjzKGVH10kKv5VX+U9zCO36f9ZXcG", + "4ESVd+aoqxdvcMQwDIFPCaeh7jPyfh5F7wfLdeJXp6dqkHpnpivC3w/SDyWmMsblW9fkmpwjkTDCFRYh", + "5AK8BiEmiIOG3HBGw1B/ROu9pGcOvx3VxFPOqBs4hotrIkdgkiBusMRkCgi6NRPiCXivGw6o+wfvdT/P", + "Sql/JXfpG0l+s7pLksZFUMAU4XRPVKS+jqTWVV2jsoXNl5uypdJbFL2u87rTp+VEl6Kpk6T6Di0mWEj+", + "oInQX4NyAaIp7wal8orP8oerpsBeMC2wMozjuuxrwFRcPI+iFTwMGlkfScBFQBPxFy4CxHSrbMPdVcwN", + "GtDX/xDwRjd2LnS21N1vXKQyt3mdpPJ0+3rbNEf/ax5Fnm6zGUFXE5walkSgO9FBUq20NFmLOrU84XI8", + "JndGDQSNi4sXOz9tRk23RJGsqOwNAR2Ww3RfUjfVnMHbuX7hh/dcbJuqb8yG2z+KyEGBVatBEowX5qvF", + "aV/hR3UnQG1khpmydwYvp4zYZ5UyYtqG/fAykvHHDy4lPmXqWwPc9v58PMVbuYgjJ+4N1Wwwa+LXtFHv", + "1enpTpXQ6Hb1lSLDfobDpo7yh7cpqv/i45MW3XoYpgisShZKgRCVMbqNWQu9WdWHbOFyywTVV48vuECR", + "DtgnSagutqmqdRlP4Ykdp2sFmgALrjroNFXKKtcz7pqM0UTawxgxubYcrnqrZrGHK6y9EDAV3zMtg99H", + "XKu6KKhQDooqqi19vss2UHDFTmnPh88G6aUKVIt9CzlohPhGNzIFcw5C+WNnZaSrmxpuFu8+pIZL23a6", + "brVqnk2Z+UfQcMOSWrMtfR+dWjtBeWGx+kdttEut0XiVmafxTytv+gv/9IkfpU+sDnpSbBpTBn1lcbnp", + "IO32f03r0M4n/WO47rhQQH92ZdtPfR+m1HSrWbeMRfBRCKXBKUD6Su/2ZZKmDYUe6bUNSTiLgkqd5A8+", + 
"3VZANyr70bj769e45Om4UYXLVmXLXpf/bmRr25bPwGDLtfP0eCxirjnNYiJoKbRl+canKwNa29BSdeG1", + "w9IOss18W2D9CeQ0QM0a0aUdSNvXJG25CjDxwyRA4Ojssmk+MNRUnzDSM5iGnW3g7ozLAWTItse9JoIC", + "H4Z+EkKBQNoiVrd15hXHuue5tskPJm/ZIo6NTnvj8rQH6mOKMdw8oXYv32RVcVzu6zyVtaXmQz1bqSw1", + "xmyDulKLwc8SvBpVpTliWZfC1bCSA6hqB/TrbXCRxDFlggNxS9U3Erg6y//7xZvXYEyDxQCk4wjQbXWN", + "ijP9UHmMfDzBKFA9KeXYU1WtDZn6cHCUm8COjBlqxTRWqiPQV3YMjbV7BIGArD39CCDzZ3iOHMpEz5n6", + "Rw9XHlt2HZpeZNHrSPRUF9zipKWPMmSwFPejiKPOIJrv5usvcKZNTe0Uuc7AY0wgW9RtC/zG1OcAP+GC", + "Rnbe4TFowOI3N03L5JjROQ7Krci/k/7Bp/AOR0mUfnzj5Ln6lAHTpR7qozaq0MjyFLrzEQq4qvzY2bDX", + "8HKbYbMXjr6hW62bttq00qf8hjXTWXNCucXSx7RMLigFIWRTtPPD3Ew0spZdTBwel64lPsJq77nlvszP", + "qFnfXS+krRlpPkRtd5ru2G5l99X3E4Xl+rc9wuuF89TNrCop/75YsLs9k7DtUvKrR5y1k9HWvEQ2PYGc", + "0cUwr6gPQxCgOQpprLrp63e9ppew0PQGH3Q6MkwLZSA3OOgedL37d/f/GwAA//+vyisL97MAAA==", } // GetSwagger returns the content of the embedded swagger specification file diff --git a/lib/resources/gpu.go b/lib/resources/gpu.go new file mode 100644 index 00000000..9349a5fd --- /dev/null +++ b/lib/resources/gpu.go @@ -0,0 +1,101 @@ +package resources + +import ( + "github.com/onkernel/hypeman/lib/devices" +) + +// GPUResourceStatus represents the GPU resource status for the API response. +// Returns nil if no GPU is available on the host. +type GPUResourceStatus struct { + Mode string `json:"mode"` // "vgpu" or "passthrough" + TotalSlots int `json:"total_slots"` // VFs for vGPU, physical GPUs for passthrough + UsedSlots int `json:"used_slots"` // Slots currently in use + Profiles []devices.GPUProfile `json:"profiles,omitempty"` // vGPU mode only + Devices []devices.PassthroughDevice `json:"devices,omitempty"` // passthrough mode only +} + +// GetGPUStatus returns the current GPU resource status. +// Returns nil if no GPU is available or the mode is "none". +func GetGPUStatus() *GPUResourceStatus { + mode := devices.DetectHostGPUMode() + if mode == devices.GPUModeNone { + return nil + } + + switch mode { + case devices.GPUModeVGPU: + return getVGPUStatus() + case devices.GPUModePassthrough: + return getPassthroughStatus() + default: + return nil + } +} + +// getVGPUStatus returns GPU status for vGPU mode (SR-IOV + mdev). +func getVGPUStatus() *GPUResourceStatus { + vfs, err := devices.DiscoverVFs() + if err != nil || len(vfs) == 0 { + return nil + } + + // Count used VFs (those with mdevs) + usedSlots := 0 + for _, vf := range vfs { + if vf.HasMdev { + usedSlots++ + } + } + + // Get available profiles (reuse VFs to avoid redundant discovery) + profiles, err := devices.ListGPUProfilesWithVFs(vfs) + if err != nil { + profiles = nil + } + + return &GPUResourceStatus{ + Mode: string(devices.GPUModeVGPU), + TotalSlots: len(vfs), + UsedSlots: usedSlots, + Profiles: profiles, + } +} + +// getPassthroughStatus returns GPU status for whole-GPU passthrough mode. 
+func getPassthroughStatus() *GPUResourceStatus { + available, err := devices.DiscoverAvailableDevices() + if err != nil || len(available) == 0 { + return nil + } + + // Filter to GPUs only and build passthrough device list + var passthroughDevices []devices.PassthroughDevice + for _, dev := range available { + // NVIDIA vendor ID is 0x10de + if dev.VendorID == "10de" { + passthroughDevices = append(passthroughDevices, devices.PassthroughDevice{ + Name: dev.DeviceName, + Available: dev.CurrentDriver == nil || *dev.CurrentDriver != "vfio-pci", + }) + } + } + + if len(passthroughDevices) == 0 { + return nil + } + + // Count used (those bound to vfio-pci, likely attached to a VM) + usedSlots := 0 + for _, dev := range passthroughDevices { + if !dev.Available { + usedSlots++ + } + } + + return &GPUResourceStatus{ + Mode: string(devices.GPUModePassthrough), + TotalSlots: len(passthroughDevices), + UsedSlots: usedSlots, + Devices: passthroughDevices, + } +} diff --git a/lib/resources/resource.go b/lib/resources/resource.go index 2c0d4730..39b1f8eb 100644 --- a/lib/resources/resource.go +++ b/lib/resources/resource.go @@ -79,6 +79,7 @@ type FullResourceStatus struct { Disk ResourceStatus `json:"disk"` Network ResourceStatus `json:"network"` DiskDetail *DiskBreakdown `json:"disk_breakdown,omitempty"` + GPU *GPUResourceStatus `json:"gpu,omitempty"` // nil if no GPU available Allocations []AllocationBreakdown `json:"allocations"` } @@ -323,12 +324,16 @@ func (m *Manager) GetFullStatus(ctx context.Context) (*FullResourceStatus, error } } + // Get GPU status + gpuStatus := GetGPUStatus() + return &FullResourceStatus{ CPU: *cpuStatus, Memory: *memStatus, Disk: *diskStatus, Network: *netStatus, DiskDetail: diskBreakdown, + GPU: gpuStatus, Allocations: allocations, }, nil } diff --git a/lib/system/init/drivers.go b/lib/system/init/drivers.go deleted file mode 100644 index 935e0f9e..00000000 --- a/lib/system/init/drivers.go +++ /dev/null @@ -1,190 +0,0 @@ -package main - -import ( - "fmt" - "os" - "os/exec" - "path/filepath" - "runtime" - "strings" -) - -// loadGPUDrivers loads NVIDIA kernel modules for GPU passthrough. -func loadGPUDrivers(log *Logger) error { - log.Info("gpu", "loading NVIDIA kernel modules") - - // Find kernel version directory - modules, err := os.ReadDir("/lib/modules") - if err != nil { - return fmt.Errorf("read /lib/modules: %w", err) - } - - if len(modules) == 0 { - return fmt.Errorf("no kernel modules found") - } - - kver := modules[0].Name() - gpuDir := filepath.Join("/lib/modules", kver, "kernel/drivers/gpu") - - if _, err := os.Stat(gpuDir); err != nil { - return fmt.Errorf("GPU modules not found for kernel %s", kver) - } - - // Load modules in order (dependencies first) - moduleOrder := []string{ - "nvidia.ko", - "nvidia-uvm.ko", - "nvidia-modeset.ko", - "nvidia-drm.ko", - } - - for _, mod := range moduleOrder { - modPath := filepath.Join(gpuDir, mod) - if _, err := os.Stat(modPath); err != nil { - log.Error("gpu", fmt.Sprintf("%s not found", mod), nil) - continue - } - - args := []string{modPath} - // nvidia-drm needs modeset=1 - if mod == "nvidia-drm.ko" { - args = append(args, "modeset=1") - } - - cmd := exec.Command("/sbin/insmod", args...) 
- if output, err := cmd.CombinedOutput(); err != nil { - log.Error("gpu", fmt.Sprintf("insmod %s failed", mod), fmt.Errorf("%s", output)) - } - } - - log.Info("gpu", fmt.Sprintf("loaded NVIDIA modules for kernel %s", kver)) - - // Create device nodes using nvidia-modprobe if available - if err := createNvidiaDevices(log); err != nil { - log.Error("gpu", "failed to create device nodes", err) - } - - // Inject NVIDIA userspace driver libraries into container rootfs - if err := injectNvidiaLibraries(log); err != nil { - log.Error("gpu", "failed to inject driver libraries", err) - } - - return nil -} - -// createNvidiaDevices creates NVIDIA device nodes. -func createNvidiaDevices(log *Logger) error { - // Try nvidia-modprobe first (the official NVIDIA utility) - if _, err := os.Stat("/usr/bin/nvidia-modprobe"); err == nil { - log.Info("gpu", "running nvidia-modprobe to create device nodes") - - cmd := exec.Command("/usr/bin/nvidia-modprobe") - cmd.CombinedOutput() - - cmd = exec.Command("/usr/bin/nvidia-modprobe", "-u", "-c=0") - cmd.CombinedOutput() - - return nil - } - - // Fallback: Manual device node creation - log.Info("gpu", "nvidia-modprobe not found, creating device nodes manually") - - // Read major numbers from /proc/devices - data, err := os.ReadFile("/proc/devices") - if err != nil { - return err - } - - lines := strings.Split(string(data), "\n") - var nvidiaMajor, uvmMajor string - - for _, line := range lines { - fields := strings.Fields(line) - if len(fields) >= 2 { - if fields[1] == "nvidia-frontend" || fields[1] == "nvidia" { - nvidiaMajor = fields[0] - } else if fields[1] == "nvidia-uvm" { - uvmMajor = fields[0] - } - } - } - - if nvidiaMajor != "" { - exec.Command("/bin/mknod", "-m", "666", "/dev/nvidiactl", "c", nvidiaMajor, "255").Run() - exec.Command("/bin/mknod", "-m", "666", "/dev/nvidia0", "c", nvidiaMajor, "0").Run() - log.Info("gpu", fmt.Sprintf("created /dev/nvidiactl and /dev/nvidia0 (major %s)", nvidiaMajor)) - } - - if uvmMajor != "" { - exec.Command("/bin/mknod", "-m", "666", "/dev/nvidia-uvm", "c", uvmMajor, "0").Run() - exec.Command("/bin/mknod", "-m", "666", "/dev/nvidia-uvm-tools", "c", uvmMajor, "1").Run() - log.Info("gpu", fmt.Sprintf("created /dev/nvidia-uvm* (major %s)", uvmMajor)) - } - - return nil -} - -// injectNvidiaLibraries injects NVIDIA userspace driver libraries into the container rootfs. -// This allows containers to use standard CUDA images without bundled drivers. 
-func injectNvidiaLibraries(log *Logger) error { - srcDir := "/usr/lib/nvidia" - if _, err := os.Stat(srcDir); err != nil { - return nil // No driver libraries to inject - } - - log.Info("gpu", "injecting NVIDIA driver libraries into container") - - // Determine library path based on architecture - var libDst string - if runtime.GOARCH == "arm64" { - libDst = "/overlay/newroot/usr/lib/aarch64-linux-gnu" - } else { - libDst = "/overlay/newroot/usr/lib/x86_64-linux-gnu" - } - binDst := "/overlay/newroot/usr/bin" - - if err := os.MkdirAll(libDst, 0755); err != nil { - return err - } - if err := os.MkdirAll(binDst, 0755); err != nil { - return err - } - - // Copy all driver libraries - libs, _ := filepath.Glob(filepath.Join(srcDir, "*.so.*")) - for _, lib := range libs { - libname := filepath.Base(lib) - data, err := os.ReadFile(lib) - if err != nil { - continue - } - os.WriteFile(filepath.Join(libDst, libname), data, 0755) - - // Create standard symlinks - base := strings.Split(libname, ".so.")[0] - os.Symlink(libname, filepath.Join(libDst, base+".so.1")) - os.Symlink(base+".so.1", filepath.Join(libDst, base+".so")) - } - - // Copy nvidia-smi and nvidia-modprobe binaries - for _, bin := range []string{"nvidia-smi", "nvidia-modprobe"} { - srcPath := filepath.Join("/usr/bin", bin) - if data, err := os.ReadFile(srcPath); err == nil { - os.WriteFile(filepath.Join(binDst, bin), data, 0755) - } - } - - // Update ldconfig cache - exec.Command("/usr/sbin/chroot", "/overlay/newroot", "ldconfig").Run() - - // Read driver version - version := "unknown" - if data, err := os.ReadFile(filepath.Join(srcDir, "version")); err == nil { - version = strings.TrimSpace(string(data)) - } - - log.Info("gpu", fmt.Sprintf("injected NVIDIA driver libraries (version: %s)", version)) - return nil -} - diff --git a/lib/system/init/main.go b/lib/system/init/main.go index a98c47fe..96ccebca 100644 --- a/lib/system/init/main.go +++ b/lib/system/init/main.go @@ -46,15 +46,7 @@ func main() { } } - // Phase 5: Load GPU drivers if needed - if cfg.HasGPU { - if err := loadGPUDrivers(log); err != nil { - log.Error("gpu", "failed to load GPU drivers", err) - // Continue anyway - } - } - - // Phase 6: Mount volumes + // Phase 5: Mount volumes if len(cfg.VolumeMounts) > 0 { if err := mountVolumes(log, cfg); err != nil { log.Error("volumes", "failed to mount volumes", err) @@ -62,19 +54,19 @@ func main() { } } - // Phase 7: Bind mount filesystems to new root + // Phase 6: Bind mount filesystems to new root if err := bindMountsToNewRoot(log); err != nil { log.Error("bind", "failed to bind mounts", err) dropToShell() } - // Phase 8: Copy guest-agent to target location + // Phase 7: Copy guest-agent to target location if err := copyGuestAgent(log); err != nil { log.Error("agent", "failed to copy guest-agent", err) // Continue anyway - exec will still work, just no remote access } - // Phase 9: Mode-specific execution + // Phase 8: Mode-specific execution if cfg.InitMode == "systemd" { log.Info("mode", "entering systemd mode") runSystemdMode(log, cfg) @@ -94,4 +86,3 @@ func dropToShell() { cmd.Run() os.Exit(1) } - diff --git a/lib/system/initrd.go b/lib/system/initrd.go index 567247d2..f3e1260e 100644 --- a/lib/system/initrd.go +++ b/lib/system/initrd.go @@ -1,21 +1,16 @@ package system import ( - "archive/tar" - "compress/gzip" "context" "crypto/sha256" "encoding/hex" "fmt" - "io" - "net/http" "os" "path/filepath" "strconv" "time" "github.com/onkernel/hypeman/lib/images" - "github.com/onkernel/hypeman/lib/logger" ) const alpineBaseImage 
= "alpine:3.22" @@ -60,13 +55,6 @@ func (m *manager) buildInitrd(ctx context.Context, arch string) (string, error) return "", fmt.Errorf("write guest-agent: %w", err) } - // Add NVIDIA kernel modules (for GPU passthrough support) - if err := m.addNvidiaModules(ctx, rootfsDir, arch); err != nil { - // Log but don't fail - NVIDIA modules are optional (not available on all architectures) - log := logger.FromContext(ctx) - log.InfoContext(ctx, "skipping NVIDIA modules", "error", err) - } - // Write shell wrapper as /init (sets up /proc, /sys, /dev before Go runtime) // The Go runtime needs these filesystems during initialization initWrapperPath := filepath.Join(rootfsDir, "init") @@ -153,168 +141,11 @@ func (m *manager) isInitrdStale(initrdPath, arch string) bool { return string(storedHash) != currentHash } -// computeInitrdHash computes a hash of the embedded binaries and NVIDIA assets for a specific architecture -func computeInitrdHash(arch string) string { +// computeInitrdHash computes a hash of the embedded binaries +func computeInitrdHash(_ string) string { h := sha256.New() h.Write(GuestAgentBinary) h.Write(InitBinary) h.Write(InitWrapper) - // Include NVIDIA driver version in hash so initrd is rebuilt when driver changes - if ver, ok := NvidiaDriverVersion[DefaultKernelVersion]; ok { - h.Write([]byte(ver)) - } - // Include driver libs URL so initrd is rebuilt when the libs tarball changes - if archURLs, ok := NvidiaDriverLibURLs[DefaultKernelVersion]; ok { - if url, ok := archURLs[arch]; ok { - h.Write([]byte(url)) - } - } return hex.EncodeToString(h.Sum(nil))[:16] } - -// addNvidiaModules downloads and extracts NVIDIA kernel modules into the rootfs -func (m *manager) addNvidiaModules(ctx context.Context, rootfsDir, arch string) error { - // Check if NVIDIA modules are available for this architecture - archURLs, ok := NvidiaModuleURLs[DefaultKernelVersion] - if !ok { - return fmt.Errorf("no NVIDIA modules for kernel version %s", DefaultKernelVersion) - } - url, ok := archURLs[arch] - if !ok { - return fmt.Errorf("no NVIDIA modules for architecture %s", arch) - } - - // Download the tarball - client := &http.Client{ - CheckRedirect: func(req *http.Request, via []*http.Request) error { - return nil // Follow redirects - }, - } - - req, err := http.NewRequestWithContext(ctx, "GET", url, nil) - if err != nil { - return fmt.Errorf("create request: %w", err) - } - - resp, err := client.Do(req) - if err != nil { - return fmt.Errorf("download nvidia modules: %w", err) - } - defer resp.Body.Close() - - if resp.StatusCode != http.StatusOK { - return fmt.Errorf("download failed with status %d", resp.StatusCode) - } - - // Extract tarball directly into rootfs - if err := extractTarGz(resp.Body, rootfsDir); err != nil { - return fmt.Errorf("extract nvidia modules: %w", err) - } - - // Add userspace driver libraries (libcuda.so, libnvidia-ml.so, nvidia-smi, etc.) - // These are injected into containers at boot time - see lib/devices/GPU.md - if err := m.addNvidiaDriverLibs(ctx, rootfsDir, arch); err != nil { - log := logger.FromContext(ctx) - log.WarnContext(ctx, "could not add nvidia driver libs", "error", err) - // Don't fail - kernel modules can still work, but containers won't have driver libs - } - - return nil -} - -// addNvidiaDriverLibs downloads and extracts NVIDIA userspace driver libraries -// These libraries (libcuda.so, libnvidia-ml.so, nvidia-smi, etc.) are injected -// into containers at boot time, eliminating the need for containers to bundle -// matching driver versions. 
See lib/devices/GPU.md for documentation. -func (m *manager) addNvidiaDriverLibs(ctx context.Context, rootfsDir, arch string) error { - archURLs, ok := NvidiaDriverLibURLs[DefaultKernelVersion] - if !ok { - return fmt.Errorf("no NVIDIA driver libs for kernel version %s", DefaultKernelVersion) - } - url, ok := archURLs[arch] - if !ok { - return fmt.Errorf("no NVIDIA driver libs for architecture %s", arch) - } - - client := &http.Client{ - CheckRedirect: func(req *http.Request, via []*http.Request) error { - return nil // Follow redirects - }, - } - - req, err := http.NewRequestWithContext(ctx, "GET", url, nil) - if err != nil { - return fmt.Errorf("create request: %w", err) - } - - resp, err := client.Do(req) - if err != nil { - return fmt.Errorf("download nvidia driver libs: %w", err) - } - defer resp.Body.Close() - - if resp.StatusCode != http.StatusOK { - return fmt.Errorf("download failed with status %d", resp.StatusCode) - } - - // Extract tarball directly into rootfs - if err := extractTarGz(resp.Body, rootfsDir); err != nil { - return fmt.Errorf("extract nvidia driver libs: %w", err) - } - - log := logger.FromContext(ctx) - log.InfoContext(ctx, "added NVIDIA driver libraries", "url", url) - return nil -} - -// extractTarGz extracts a gzipped tarball into the destination directory -func extractTarGz(r io.Reader, destDir string) error { - gzr, err := gzip.NewReader(r) - if err != nil { - return fmt.Errorf("create gzip reader: %w", err) - } - defer gzr.Close() - - tr := tar.NewReader(gzr) - for { - header, err := tr.Next() - if err == io.EOF { - break - } - if err != nil { - return fmt.Errorf("read tar: %w", err) - } - - // Calculate destination path - destPath := filepath.Join(destDir, header.Name) - - switch header.Typeflag { - case tar.TypeDir: - if err := os.MkdirAll(destPath, os.FileMode(header.Mode)); err != nil { - return fmt.Errorf("create directory %s: %w", destPath, err) - } - case tar.TypeReg: - // Ensure parent directory exists - if err := os.MkdirAll(filepath.Dir(destPath), 0755); err != nil { - return fmt.Errorf("create parent dir: %w", err) - } - - outFile, err := os.Create(destPath) - if err != nil { - return fmt.Errorf("create file %s: %w", destPath, err) - } - - if _, err := io.Copy(outFile, tr); err != nil { - outFile.Close() - return fmt.Errorf("write file %s: %w", destPath, err) - } - outFile.Close() - - if err := os.Chmod(destPath, os.FileMode(header.Mode)); err != nil { - return fmt.Errorf("chmod %s: %w", destPath, err) - } - } - } - - return nil -} diff --git a/lib/system/versions.go b/lib/system/versions.go index 7d59cd85..510f5422 100644 --- a/lib/system/versions.go +++ b/lib/system/versions.go @@ -42,32 +42,6 @@ var KernelDownloadURLs = map[KernelVersion]map[string]string{ // Add future versions here } -// NvidiaModuleURLs maps kernel versions and architectures to NVIDIA module tarball URLs -// These tarballs contain pre-built NVIDIA kernel modules that match the kernel version -var NvidiaModuleURLs = map[KernelVersion]map[string]string{ - Kernel_20251213: { - "x86_64": "https://github.com/onkernel/linux/releases/download/ch-6.12.8-kernel-1.2-20251213/nvidia-modules-x86_64.tar.gz", - // Note: NVIDIA open-gpu-kernel-modules does not support arm64 yet - }, - // Kernel_202511182 and Kernel_20251211 do not have NVIDIA modules (pre-module-support kernels) -} - -// NvidiaDriverLibURLs maps kernel versions and architectures to driver library tarball URLs -// These tarballs contain userspace NVIDIA libraries (libcuda.so, libnvidia-ml.so, etc.) 
-// that match the kernel modules and are injected into containers at boot time. -// See lib/devices/GPU.md for documentation on driver injection. -var NvidiaDriverLibURLs = map[KernelVersion]map[string]string{ - Kernel_20251213: { - "x86_64": "https://github.com/onkernel/linux/releases/download/ch-6.12.8-kernel-1.2-20251213/nvidia-driver-libs-x86_64.tar.gz", - }, -} - -// NvidiaDriverVersion tracks the NVIDIA driver version bundled with each kernel -var NvidiaDriverVersion = map[KernelVersion]string{ - Kernel_20251213: "570.86.16", - // Kernel_202511182 and Kernel_20251211 do not have NVIDIA modules -} - // GetArch returns the architecture string for the current platform func GetArch() string { arch := runtime.GOARCH diff --git a/lib/vmconfig/config.go b/lib/vmconfig/config.go index db7c554f..bb7b9a0c 100644 --- a/lib/vmconfig/config.go +++ b/lib/vmconfig/config.go @@ -20,9 +20,6 @@ type Config struct { GuestGW string `json:"guest_gw,omitempty"` GuestDNS string `json:"guest_dns,omitempty"` - // GPU passthrough - HasGPU bool `json:"has_gpu"` - // Volume mounts VolumeMounts []VolumeMount `json:"volume_mounts,omitempty"` diff --git a/openapi.yaml b/openapi.yaml index fac72d3a..7b7da06a 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -167,6 +167,8 @@ components: type: string description: Device IDs or names to attach for GPU/PCI passthrough example: ["l4-gpu"] + gpu: + $ref: "#/components/schemas/GPUConfig" volumes: type: array description: Volumes to attach to the instance at creation time @@ -262,6 +264,8 @@ components: description: Volumes attached to the instance items: $ref: "#/components/schemas/VolumeMount" + gpu: + $ref: "#/components/schemas/InstanceGPU" created_at: type: string format: date-time @@ -587,6 +591,90 @@ components: enum: [gpu, pci] description: Type of PCI device + GPUConfig: + type: object + description: GPU configuration for the instance + properties: + profile: + type: string + description: vGPU profile name (e.g., "L40S-1Q"). Only used in vGPU mode. + example: "L40S-1Q" + + InstanceGPU: + type: object + description: GPU information attached to the instance + properties: + profile: + type: string + description: vGPU profile name + example: "L40S-1Q" + mdev_uuid: + type: string + description: mdev device UUID + example: "aa618089-8b16-4d01-a136-25a0f3c73123" + + GPUProfile: + type: object + description: Available vGPU profile + required: [name, framebuffer_mb, available] + properties: + name: + type: string + description: Profile name (user-facing) + example: "L40S-1Q" + framebuffer_mb: + type: integer + description: Frame buffer size in MB + example: 1024 + available: + type: integer + description: Number of instances that can be created with this profile + example: 59 + + PassthroughDevice: + type: object + description: Physical GPU available for passthrough + required: [name, available] + properties: + name: + type: string + description: GPU name + example: "NVIDIA L40S" + available: + type: boolean + description: Whether this GPU is available (not attached to an instance) + example: true + + GPUResourceStatus: + type: object + description: GPU resource status. Null if no GPUs available. 
+ nullable: true + required: [mode, total_slots, used_slots] + properties: + mode: + type: string + enum: [vgpu, passthrough] + description: GPU mode (vgpu for SR-IOV/mdev, passthrough for whole GPU) + example: "vgpu" + total_slots: + type: integer + description: Total slots (VFs for vGPU, physical GPUs for passthrough) + example: 64 + used_slots: + type: integer + description: Slots currently in use + example: 5 + profiles: + type: array + description: Available vGPU profiles (only in vGPU mode) + items: + $ref: "#/components/schemas/GPUProfile" + devices: + type: array + description: Physical GPUs (only in passthrough mode) + items: + $ref: "#/components/schemas/PassthroughDevice" + CreateDeviceRequest: type: object required: [pci_address] @@ -795,6 +883,8 @@ components: $ref: "#/components/schemas/ResourceStatus" disk_breakdown: $ref: "#/components/schemas/DiskBreakdown" + gpu: + $ref: "#/components/schemas/GPUResourceStatus" allocations: type: array items: