diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4f9aa7d..00d46f8 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -34,6 +34,9 @@ jobs: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_PASSWORD }} + - name: Clean cached binaries + run: make clean + - name: Build run: make build diff --git a/Makefile b/Makefile index 8b06a3c..07b2bf0 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ SHELL := /bin/bash -.PHONY: oapi-generate generate-vmm-client generate-wire generate-all dev build test install-tools gen-jwt download-ch-binaries download-ch-spec ensure-ch-binaries build-caddy-binaries build-caddy ensure-caddy-binaries release-prep +.PHONY: oapi-generate generate-vmm-client generate-wire generate-all dev build test install-tools gen-jwt download-ch-binaries download-ch-spec ensure-ch-binaries build-caddy-binaries build-caddy ensure-caddy-binaries release-prep clean # Directory where local binaries will be installed BIN_DIR ?= $(CURDIR)/bin @@ -192,10 +192,8 @@ gen-jwt: $(GODOTENV) # Clean generated files and binaries clean: rm -rf $(BIN_DIR) - rm -f lib/oapi/oapi.go - rm -f lib/vmm/vmm.go - rm -f lib/exec/exec.pb.go - rm -f lib/exec/exec_grpc.pb.go + rm -rf lib/vmm/binaries/cloud-hypervisor/ + rm -rf lib/ingress/binaries/ rm -f lib/system/exec_agent/exec-agent # Prepare for release build (called by GoReleaser) diff --git a/lib/instances/start.go b/lib/instances/start.go index 149c199..a29c3ad 100644 --- a/lib/instances/start.go +++ b/lib/instances/start.go @@ -8,6 +8,7 @@ import ( "github.com/onkernel/hypeman/lib/logger" "github.com/onkernel/hypeman/lib/network" "go.opentelemetry.io/otel/trace" + "gvisor.dev/gvisor/pkg/cleanup" ) // startInstance starts a stopped instance @@ -52,46 +53,53 @@ func (m *manager) startInstance( return nil, fmt.Errorf("get image: %w", err) } - // 4. Recreate network allocation if network enabled + // Setup cleanup stack for automatic rollback on errors + cu := cleanup.Make(func() {}) + defer cu.Clean() + + // 4. Allocate fresh network if network enabled var netConfig *network.NetworkConfig if stored.NetworkEnabled { - log.DebugContext(ctx, "recreating network for start", "instance_id", id, "network", "default") - if err := m.networkManager.RecreateAllocation(ctx, id); err != nil { - log.ErrorContext(ctx, "failed to recreate network", "instance_id", id, "error", err) - return nil, fmt.Errorf("recreate network: %w", err) - } - // Get the network config for VM configuration - netAlloc, err := m.networkManager.GetAllocation(ctx, id) + log.DebugContext(ctx, "allocating network for start", "instance_id", id, "network", "default") + netConfig, err = m.networkManager.CreateAllocation(ctx, network.AllocateRequest{ + InstanceID: id, + InstanceName: stored.Name, + }) if err != nil { - log.ErrorContext(ctx, "failed to get network allocation", "instance_id", id, "error", err) - // Cleanup network on failure - if netAlloc != nil { - m.networkManager.ReleaseAllocation(ctx, netAlloc) - } - return nil, fmt.Errorf("get network allocation: %w", err) - } - netConfig = &network.NetworkConfig{ - TAPDevice: netAlloc.TAPDevice, - IP: netAlloc.IP, - MAC: netAlloc.MAC, - Netmask: "255.255.255.0", // Default netmask + log.ErrorContext(ctx, "failed to allocate network", "instance_id", id, "error", err) + return nil, fmt.Errorf("allocate network: %w", err) } + // Update stored metadata with new IP/MAC + stored.IP = netConfig.IP + stored.MAC = netConfig.MAC + // Add network cleanup to stack + cu.Add(func() { + m.networkManager.ReleaseAllocation(ctx, &network.Allocation{ + InstanceID: id, + TAPDevice: netConfig.TAPDevice, + }) + }) } - // 5. Start VMM and boot VM (reuses logic from create) + // 5. Regenerate config disk with new network configuration + instForConfig := &Instance{StoredMetadata: *stored} + log.DebugContext(ctx, "regenerating config disk", "instance_id", id) + if err := m.createConfigDisk(instForConfig, imageInfo, netConfig); err != nil { + log.ErrorContext(ctx, "failed to create config disk", "instance_id", id, "error", err) + return nil, fmt.Errorf("create config disk: %w", err) + } + + // 6. Start VMM and boot VM (reuses logic from create) log.InfoContext(ctx, "starting VMM and booting VM", "instance_id", id) if err := m.startAndBootVM(ctx, stored, imageInfo, netConfig); err != nil { log.ErrorContext(ctx, "failed to start and boot VM", "instance_id", id, "error", err) - // Cleanup network on failure - if stored.NetworkEnabled { - if netAlloc, err := m.networkManager.GetAllocation(ctx, id); err == nil { - m.networkManager.ReleaseAllocation(ctx, netAlloc) - } - } return nil, err } - // 6. Update metadata (set PID, StartedAt) + // Success - release cleanup stack (prevent cleanup) + cu.Release() + + // 7. Update metadata (set PID, StartedAt) now := time.Now() stored.StartedAt = &now