Skip to content

Commit d223fee

Browse files
committed
Fix cleanup of vm
1 parent 4abaf5e commit d223fee

File tree

5 files changed

+61
-75
lines changed

5 files changed

+61
-75
lines changed

lib/exec/client.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -67,15 +67,17 @@ func getOrCreateConn(ctx context.Context, dialer hypervisor.VsockDialer) (*grpc.
6767
return conn, nil
6868
}
6969

70-
// CloseConn closes and removes a connection from the pool (call when VM is deleted)
70+
// CloseConn removes a connection from the pool (call when VM is deleted).
71+
// We only remove from pool, not explicitly close - the connection will fail
72+
// naturally when the VM dies, and grpc will clean up. Calling Close() on a
73+
// connection with an active reader can cause panics in grpc internals.
7174
func CloseConn(dialerKey string) {
7275
connPool.Lock()
7376
defer connPool.Unlock()
7477

75-
if conn, ok := connPool.conns[dialerKey]; ok {
76-
conn.Close()
78+
if _, ok := connPool.conns[dialerKey]; ok {
7779
delete(connPool.conns, dialerKey)
78-
slog.Debug("closed gRPC connection", "key", dialerKey)
80+
slog.Debug("removed gRPC connection from pool", "key", dialerKey)
7981
}
8082
}
8183

lib/hypervisor/qemu/pool.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,12 +46,14 @@ func GetOrCreate(socketPath string) (*QEMU, error) {
4646

4747
// Remove closes and removes a client from the pool.
4848
// Called automatically on errors to allow fresh reconnection.
49+
// Close is done asynchronously to avoid blocking if the connection is in a bad state.
4950
func Remove(socketPath string) {
5051
clientPool.Lock()
5152
defer clientPool.Unlock()
5253

5354
if client, ok := clientPool.clients[socketPath]; ok {
54-
client.client.Close()
5555
delete(clientPool.clients, socketPath)
56+
// Close asynchronously to avoid blocking on stuck connections
57+
go client.client.Close()
5658
}
5759
}

lib/hypervisor/qemu/vsock.go

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package qemu
33
import (
44
"context"
55
"fmt"
6+
"io"
67
"log/slog"
78
"net"
89
"time"
@@ -158,7 +159,16 @@ func newVsockConn(fd int, remoteCID, remotePort uint32) (*vsockConn, error) {
158159
}
159160

160161
func (c *vsockConn) Read(b []byte) (int, error) {
161-
return unix.Read(c.fd, b)
162+
n, err := unix.Read(c.fd, b)
163+
// Ensure we never return negative n (violates io.Reader contract)
164+
// This can happen when the vsock fd becomes invalid (VM died)
165+
if n < 0 {
166+
if err == nil {
167+
err = io.EOF
168+
}
169+
return 0, err
170+
}
171+
return n, err
162172
}
163173

164174
func (c *vsockConn) Write(b []byte) (int, error) {

lib/instances/delete.go

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ import (
77
"syscall"
88
"time"
99

10+
"github.com/onkernel/hypeman/lib/exec"
11+
"github.com/onkernel/hypeman/lib/hypervisor"
1012
"github.com/onkernel/hypeman/lib/logger"
1113
"github.com/onkernel/hypeman/lib/network"
1214
)
@@ -39,7 +41,12 @@ func (m *manager) deleteInstance(
3941
}
4042
}
4143

42-
// 3. If hypervisor might be running, force kill it
44+
// 3. Close exec gRPC connection before killing hypervisor to prevent panic
45+
if dialer, err := hypervisor.NewVsockDialer(inst.HypervisorType, inst.VsockSocket, inst.VsockCID); err == nil {
46+
exec.CloseConn(dialer.Key())
47+
}
48+
49+
// 4. If hypervisor might be running, force kill it
4350
// Also attempt kill for StateUnknown since we can't be sure if hypervisor is running
4451
if inst.State.RequiresVMM() || inst.State == StateUnknown {
4552
log.DebugContext(ctx, "stopping hypervisor", "instance_id", id, "state", inst.State)
@@ -50,7 +57,7 @@ func (m *manager) deleteInstance(
5057
}
5158
}
5259

53-
// 4. Release network allocation
60+
// 5. Release network allocation
5461
if inst.NetworkEnabled {
5562
log.DebugContext(ctx, "releasing network", "instance_id", id, "network", "default")
5663
if err := m.networkManager.ReleaseAllocation(ctx, networkAlloc); err != nil {
@@ -59,7 +66,7 @@ func (m *manager) deleteInstance(
5966
}
6067
}
6168

62-
// 5. Detach and auto-unbind devices from VFIO
69+
// 6. Detach and auto-unbind devices from VFIO
6370
if len(inst.Devices) > 0 && m.deviceManager != nil {
6471
for _, deviceID := range inst.Devices {
6572
log.DebugContext(ctx, "detaching device", "id", id, "device", deviceID)
@@ -76,7 +83,7 @@ func (m *manager) deleteInstance(
7683
}
7784
}
7885

79-
// 5b. Detach volumes
86+
// 6b. Detach volumes
8087
if len(inst.Volumes) > 0 {
8188
log.DebugContext(ctx, "detaching volumes", "instance_id", id, "count", len(inst.Volumes))
8289
for _, volAttach := range inst.Volumes {
@@ -87,7 +94,7 @@ func (m *manager) deleteInstance(
8794
}
8895
}
8996

90-
// 6. Delete all instance data
97+
// 7. Delete all instance data
9198
log.DebugContext(ctx, "deleting instance data", "instance_id", id)
9299
if err := m.deleteInstanceData(id); err != nil {
93100
log.ErrorContext(ctx, "failed to delete instance data", "instance_id", id, "error", err)

lib/network/derive.go

Lines changed: 29 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ import (
99

1010
"github.com/onkernel/hypeman/lib/hypervisor"
1111
"github.com/onkernel/hypeman/lib/logger"
12-
"github.com/onkernel/hypeman/lib/vmm"
1312
)
1413

1514
// instanceMetadata is the minimal metadata we need to derive allocations
@@ -18,6 +17,8 @@ type instanceMetadata struct {
1817
Name string
1918
NetworkEnabled bool
2019
HypervisorType string
20+
IP string // Assigned IP address
21+
MAC string // Assigned MAC address
2122
}
2223

2324
// deriveAllocation derives network allocation from CH or snapshot
@@ -49,58 +50,38 @@ func (m *manager) deriveAllocation(ctx context.Context, instanceID string) (*All
4950
}
5051
netmask := fmt.Sprintf("%d.%d.%d.%d", ipNet.Mask[0], ipNet.Mask[1], ipNet.Mask[2], ipNet.Mask[3])
5152

52-
// 4. Try to derive from running VM first
53-
socketPath := m.paths.InstanceSocket(instanceID, hypervisor.SocketNameForType(hypervisor.Type(meta.HypervisorType)))
54-
if fileExists(socketPath) {
55-
client, err := vmm.NewVMM(socketPath)
56-
if err == nil {
57-
resp, err := client.GetVmInfoWithResponse(ctx)
58-
if err == nil && resp.JSON200 != nil && resp.JSON200.Config.Net != nil && len(*resp.JSON200.Config.Net) > 0 {
59-
nets := *resp.JSON200.Config.Net
60-
net := nets[0]
61-
if net.Ip != nil && net.Mac != nil && net.Tap != nil {
62-
log.DebugContext(ctx, "derived allocation from running VM", "instance_id", instanceID)
63-
return &Allocation{
64-
InstanceID: instanceID,
65-
InstanceName: meta.Name,
66-
Network: "default",
67-
IP: *net.Ip,
68-
MAC: *net.Mac,
69-
TAPDevice: *net.Tap,
70-
Gateway: defaultNet.Gateway,
71-
Netmask: netmask,
72-
State: "running",
73-
}, nil
74-
}
53+
// 4. Use stored metadata to derive allocation (works for all hypervisors)
54+
if meta.IP != "" && meta.MAC != "" {
55+
tap := generateTAPName(instanceID)
56+
57+
// Determine state based on socket existence and snapshot
58+
socketPath := m.paths.InstanceSocket(instanceID, hypervisor.SocketNameForType(hypervisor.Type(meta.HypervisorType)))
59+
state := "stopped"
60+
if fileExists(socketPath) {
61+
state = "running"
62+
} else {
63+
// Check for snapshot (standby state)
64+
snapshotConfigJson := m.paths.InstanceSnapshotConfig(instanceID)
65+
if fileExists(snapshotConfigJson) {
66+
state = "standby"
7567
}
7668
}
77-
}
7869

79-
// 5. Try to derive from snapshot
80-
// Cloud Hypervisor creates config.json in the snapshot directory
81-
snapshotConfigJson := m.paths.InstanceSnapshotConfig(instanceID)
82-
if fileExists(snapshotConfigJson) {
83-
vmConfig, err := m.parseVmJson(snapshotConfigJson)
84-
if err == nil && vmConfig.Net != nil && len(*vmConfig.Net) > 0 {
85-
nets := *vmConfig.Net
86-
if nets[0].Ip != nil && nets[0].Mac != nil && nets[0].Tap != nil {
87-
log.DebugContext(ctx, "derived allocation from snapshot", "instance_id", instanceID)
88-
return &Allocation{
89-
InstanceID: instanceID,
90-
InstanceName: meta.Name,
91-
Network: "default",
92-
IP: *nets[0].Ip,
93-
MAC: *nets[0].Mac,
94-
TAPDevice: *nets[0].Tap,
95-
Gateway: defaultNet.Gateway,
96-
Netmask: netmask,
97-
State: "standby",
98-
}, nil
99-
}
100-
}
70+
log.DebugContext(ctx, "derived allocation from metadata", "instance_id", instanceID, "state", state)
71+
return &Allocation{
72+
InstanceID: instanceID,
73+
InstanceName: meta.Name,
74+
Network: "default",
75+
IP: meta.IP,
76+
MAC: meta.MAC,
77+
TAPDevice: tap,
78+
Gateway: defaultNet.Gateway,
79+
Netmask: netmask,
80+
State: state,
81+
}, nil
10182
}
10283

103-
// 6. No allocation (stopped or network not yet configured)
84+
// 5. No allocation (network not yet configured)
10485
return nil, nil
10586
}
10687

@@ -164,22 +145,6 @@ func (m *manager) loadInstanceMetadata(instanceID string) (*instanceMetadata, er
164145
return &meta, nil
165146
}
166147

167-
// parseVmJson parses Cloud Hypervisor's config.json from snapshot
168-
// Note: Despite the function name, this parses config.json (what CH actually creates)
169-
func (m *manager) parseVmJson(path string) (*vmm.VmConfig, error) {
170-
data, err := os.ReadFile(path)
171-
if err != nil {
172-
return nil, fmt.Errorf("read config.json: %w", err)
173-
}
174-
175-
var vmConfig vmm.VmConfig
176-
if err := json.Unmarshal(data, &vmConfig); err != nil {
177-
return nil, fmt.Errorf("unmarshal config.json: %w", err)
178-
}
179-
180-
return &vmConfig, nil
181-
}
182-
183148
// fileExists checks if a file exists
184149
func fileExists(path string) bool {
185150
_, err := os.Stat(path)

0 commit comments

Comments (0)