Skip to content

Commit 6526969

Browse files
committed
Add vsock dialer abstraction
1 parent 6948c1f commit 6526969

23 files changed

+1252
-396
lines changed

cmd/api/api/exec.go

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212

1313
"github.com/gorilla/websocket"
1414
"github.com/onkernel/hypeman/lib/exec"
15+
"github.com/onkernel/hypeman/lib/hypervisor"
1516
"github.com/onkernel/hypeman/lib/instances"
1617
"github.com/onkernel/hypeman/lib/logger"
1718
mw "github.com/onkernel/hypeman/lib/middleware"
@@ -110,8 +111,17 @@ func (s *ApiService) ExecHandler(w http.ResponseWriter, r *http.Request) {
110111
// Create WebSocket read/writer wrapper
111112
wsConn := &wsReadWriter{ws: ws, ctx: ctx}
112113

114+
// Create vsock dialer for this hypervisor type
115+
dialer, err := hypervisor.NewVsockDialer(hypervisor.Type(inst.HypervisorType), inst.VsockSocket, inst.VsockCID)
116+
if err != nil {
117+
log.ErrorContext(ctx, "failed to create vsock dialer", "error", err)
118+
ws.WriteMessage(websocket.BinaryMessage, []byte(fmt.Sprintf("Error: %v\r\n", err)))
119+
ws.WriteMessage(websocket.TextMessage, []byte(`{"exitCode":127}`))
120+
return
121+
}
122+
113123
// Execute via vsock
114-
exit, err := exec.ExecIntoInstance(ctx, inst.VsockSocket, exec.ExecOptions{
124+
exit, err := exec.ExecIntoInstance(ctx, dialer, exec.ExecOptions{
115125
Command: execReq.Command,
116126
Stdin: wsConn,
117127
Stdout: wsConn,

cmd/api/api/exec_test.go

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"time"
99

1010
"github.com/onkernel/hypeman/lib/exec"
11+
"github.com/onkernel/hypeman/lib/hypervisor"
1112
"github.com/onkernel/hypeman/lib/instances"
1213
"github.com/onkernel/hypeman/lib/oapi"
1314
"github.com/onkernel/hypeman/lib/paths"
@@ -119,13 +120,16 @@ func TestExecInstanceNonTTY(t *testing.T) {
119120
var stdout, stderr outputBuffer
120121
var execErr error
121122

123+
dialer, err := hypervisor.NewVsockDialer(actualInst.HypervisorType, actualInst.VsockSocket, actualInst.VsockCID)
124+
require.NoError(t, err)
125+
122126
t.Log("Testing exec command: whoami")
123127
maxRetries := 10
124128
for i := 0; i < maxRetries; i++ {
125129
stdout = outputBuffer{}
126130
stderr = outputBuffer{}
127131

128-
exit, execErr = exec.ExecIntoInstance(ctx(), actualInst.VsockSocket, exec.ExecOptions{
132+
exit, execErr = exec.ExecIntoInstance(ctx(), dialer, exec.ExecOptions{
129133
Command: []string{"/bin/sh", "-c", "whoami"},
130134
Stdin: nil,
131135
Stdout: &stdout,
@@ -250,9 +254,12 @@ func TestExecWithDebianMinimal(t *testing.T) {
250254
assert.Contains(t, logs, "overlay-init: app exited with code", "App should have exited")
251255

252256
// Test exec commands work even though the main app (bash) has exited
257+
dialer2, err := hypervisor.NewVsockDialer(actualInst.HypervisorType, actualInst.VsockSocket, actualInst.VsockCID)
258+
require.NoError(t, err)
259+
253260
t.Log("Testing exec command: echo")
254261
var stdout, stderr outputBuffer
255-
exit, err := exec.ExecIntoInstance(ctx(), actualInst.VsockSocket, exec.ExecOptions{
262+
exit, err := exec.ExecIntoInstance(ctx(), dialer2, exec.ExecOptions{
256263
Command: []string{"echo", "hello from debian"},
257264
Stdout: &stdout,
258265
Stderr: &stderr,
@@ -266,7 +273,7 @@ func TestExecWithDebianMinimal(t *testing.T) {
266273
// Verify we're actually in Debian
267274
t.Log("Verifying OS release...")
268275
stdout = outputBuffer{}
269-
exit, err = exec.ExecIntoInstance(ctx(), actualInst.VsockSocket, exec.ExecOptions{
276+
exit, err = exec.ExecIntoInstance(ctx(), dialer2, exec.ExecOptions{
270277
Command: []string{"cat", "/etc/os-release"},
271278
Stdout: &stdout,
272279
TTY: false,

lib/devices/gpu_e2e_test.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
"github.com/onkernel/hypeman/cmd/api/config"
1313
"github.com/onkernel/hypeman/lib/devices"
1414
"github.com/onkernel/hypeman/lib/exec"
15+
"github.com/onkernel/hypeman/lib/hypervisor"
1516
"github.com/onkernel/hypeman/lib/images"
1617
"github.com/onkernel/hypeman/lib/instances"
1718
"github.com/onkernel/hypeman/lib/network"
@@ -218,6 +219,9 @@ func TestGPUPassthrough(t *testing.T) {
218219
actualInst, err := instanceMgr.GetInstance(ctx, inst.Id)
219220
require.NoError(t, err)
220221

222+
dialer, err := hypervisor.NewVsockDialer(actualInst.HypervisorType, actualInst.VsockSocket, actualInst.VsockCID)
223+
require.NoError(t, err)
224+
221225
// Create a context with timeout for exec operations
222226
execCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
223227
defer cancel()
@@ -232,7 +236,7 @@ func TestGPUPassthrough(t *testing.T) {
232236
stdout = outputBuffer{}
233237
stderr = outputBuffer{}
234238

235-
_, execErr = exec.ExecIntoInstance(execCtx, actualInst.VsockSocket, exec.ExecOptions{
239+
_, execErr = exec.ExecIntoInstance(execCtx, dialer, exec.ExecOptions{
236240
Command: []string{"/bin/sh", "-c", checkGPUCmd},
237241
Stdin: nil,
238242
Stdout: &stdout,

lib/devices/gpu_inference_test.go

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"github.com/onkernel/hypeman/cmd/api/config"
2424
"github.com/onkernel/hypeman/lib/devices"
2525
"github.com/onkernel/hypeman/lib/exec"
26+
"github.com/onkernel/hypeman/lib/hypervisor"
2627
"github.com/onkernel/hypeman/lib/images"
2728
"github.com/onkernel/hypeman/lib/instances"
2829
"github.com/onkernel/hypeman/lib/network"
@@ -285,14 +286,17 @@ func TestGPUInference(t *testing.T) {
285286
actualInst, err := instanceMgr.GetInstance(ctx, inst.Id)
286287
require.NoError(t, err)
287288

289+
dialer, err := hypervisor.NewVsockDialer(actualInst.HypervisorType, actualInst.VsockSocket, actualInst.VsockCID)
290+
require.NoError(t, err)
291+
288292
// Step 10: Wait for Ollama server
289293
t.Log("Step 10: Waiting for Ollama server to be ready...")
290294
ollamaReady := false
291295
for i := 0; i < 60; i++ { // 60 seconds for CUDA init
292296
healthCtx, healthCancel := context.WithTimeout(ctx, 5*time.Second)
293297
var healthStdout, healthStderr inferenceOutputBuffer
294298

295-
_, err = exec.ExecIntoInstance(healthCtx, actualInst.VsockSocket, exec.ExecOptions{
299+
_, err = exec.ExecIntoInstance(healthCtx, dialer, exec.ExecOptions{
296300
Command: []string{"/bin/sh", "-c", "ollama list 2>&1"},
297301
Stdout: &healthStdout,
298302
Stderr: &healthStderr,
@@ -319,7 +323,7 @@ func TestGPUInference(t *testing.T) {
319323

320324
// Check nvidia-smi (should work now with CUDA image)
321325
var nvidiaSmiStdout, nvidiaSmiStderr inferenceOutputBuffer
322-
_, _ = exec.ExecIntoInstance(gpuCheckCtx, actualInst.VsockSocket, exec.ExecOptions{
326+
_, _ = exec.ExecIntoInstance(gpuCheckCtx, dialer, exec.ExecOptions{
323327
Command: []string{"/bin/sh", "-c", "nvidia-smi 2>&1 || echo 'nvidia-smi failed'"},
324328
Stdout: &nvidiaSmiStdout,
325329
Stderr: &nvidiaSmiStderr,
@@ -333,7 +337,7 @@ func TestGPUInference(t *testing.T) {
333337

334338
// Check NVIDIA kernel modules
335339
var modulesStdout inferenceOutputBuffer
336-
exec.ExecIntoInstance(gpuCheckCtx, actualInst.VsockSocket, exec.ExecOptions{
340+
exec.ExecIntoInstance(gpuCheckCtx, dialer, exec.ExecOptions{
337341
Command: []string{"/bin/sh", "-c", "cat /proc/modules | grep nvidia"},
338342
Stdout: &modulesStdout,
339343
})
@@ -343,7 +347,7 @@ func TestGPUInference(t *testing.T) {
343347

344348
// Check device nodes
345349
var devStdout inferenceOutputBuffer
346-
exec.ExecIntoInstance(gpuCheckCtx, actualInst.VsockSocket, exec.ExecOptions{
350+
exec.ExecIntoInstance(gpuCheckCtx, dialer, exec.ExecOptions{
347351
Command: []string{"/bin/sh", "-c", "ls -la /dev/nvidia* 2>&1"},
348352
Stdout: &devStdout,
349353
})
@@ -355,7 +359,7 @@ func TestGPUInference(t *testing.T) {
355359
t.Log("Step 12: Ensuring TinyLlama model is available...")
356360

357361
var listStdout inferenceOutputBuffer
358-
exec.ExecIntoInstance(gpuCheckCtx, actualInst.VsockSocket, exec.ExecOptions{
362+
exec.ExecIntoInstance(gpuCheckCtx, dialer, exec.ExecOptions{
359363
Command: []string{"/bin/sh", "-c", "ollama list 2>&1"},
360364
Stdout: &listStdout,
361365
})
@@ -366,7 +370,7 @@ func TestGPUInference(t *testing.T) {
366370
defer pullCancel()
367371

368372
var pullStdout inferenceOutputBuffer
369-
_, pullErr := exec.ExecIntoInstance(pullCtx, actualInst.VsockSocket, exec.ExecOptions{
373+
_, pullErr := exec.ExecIntoInstance(pullCtx, dialer, exec.ExecOptions{
370374
Command: []string{"/bin/sh", "-c", "ollama pull tinyllama 2>&1"},
371375
Stdout: &pullStdout,
372376
})

lib/devices/gpu_module_test.go

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"github.com/onkernel/hypeman/cmd/api/config"
2020
"github.com/onkernel/hypeman/lib/devices"
2121
"github.com/onkernel/hypeman/lib/exec"
22+
"github.com/onkernel/hypeman/lib/hypervisor"
2223
"github.com/onkernel/hypeman/lib/images"
2324
"github.com/onkernel/hypeman/lib/instances"
2425
"github.com/onkernel/hypeman/lib/network"
@@ -194,6 +195,9 @@ func TestNVIDIAModuleLoading(t *testing.T) {
194195
actualInst, err := instanceMgr.GetInstance(ctx, inst.Id)
195196
require.NoError(t, err)
196197

198+
dialer, err := hypervisor.NewVsockDialer(actualInst.HypervisorType, actualInst.VsockSocket, actualInst.VsockCID)
199+
require.NoError(t, err)
200+
197201
execCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
198202
defer cancel()
199203

@@ -204,7 +208,7 @@ func TestNVIDIAModuleLoading(t *testing.T) {
204208
for i := 0; i < 10; i++ {
205209
stdout = outputBuffer{}
206210
stderr = outputBuffer{}
207-
_, err = exec.ExecIntoInstance(execCtx, actualInst.VsockSocket, exec.ExecOptions{
211+
_, err = exec.ExecIntoInstance(execCtx, dialer, exec.ExecOptions{
208212
Command: []string{"/bin/sh", "-c", dmesgCmd},
209213
Stdin: nil,
210214
Stdout: &stdout,
@@ -234,7 +238,7 @@ func TestNVIDIAModuleLoading(t *testing.T) {
234238
// Check lsmod for nvidia modules
235239
stdout = outputBuffer{}
236240
stderr = outputBuffer{}
237-
_, err = exec.ExecIntoInstance(execCtx, actualInst.VsockSocket, exec.ExecOptions{
241+
_, err = exec.ExecIntoInstance(execCtx, dialer, exec.ExecOptions{
238242
Command: []string{"/bin/sh", "-c", "cat /proc/modules | grep nvidia || echo 'No nvidia modules loaded'"},
239243
Stdin: nil,
240244
Stdout: &stdout,
@@ -254,7 +258,7 @@ func TestNVIDIAModuleLoading(t *testing.T) {
254258
// Check for /dev/nvidia* devices
255259
stdout = outputBuffer{}
256260
stderr = outputBuffer{}
257-
_, err = exec.ExecIntoInstance(execCtx, actualInst.VsockSocket, exec.ExecOptions{
261+
_, err = exec.ExecIntoInstance(execCtx, dialer, exec.ExecOptions{
258262
Command: []string{"/bin/sh", "-c", "ls -la /dev/nvidia* 2>&1 || echo 'No nvidia devices found'"},
259263
Stdin: nil,
260264
Stdout: &stdout,
@@ -430,13 +434,16 @@ func TestNVMLDetection(t *testing.T) {
430434
actualInst, err := instanceMgr.GetInstance(ctx, inst.Id)
431435
require.NoError(t, err)
432436

437+
dialer2, err := hypervisor.NewVsockDialer(actualInst.HypervisorType, actualInst.VsockSocket, actualInst.VsockCID)
438+
require.NoError(t, err)
439+
433440
// Step 5: Run NVML test
434441
t.Log("Step 5: Running NVML detection test...")
435442
execCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
436443
defer cancel()
437444

438445
var stdout, stderr outputBuffer
439-
_, err = exec.ExecIntoInstance(execCtx, actualInst.VsockSocket, exec.ExecOptions{
446+
_, err = exec.ExecIntoInstance(execCtx, dialer2, exec.ExecOptions{
440447
Command: []string{"/bin/sh", "-c", "python3 /usr/local/bin/test-nvml.py 2>&1"},
441448
Stdin: nil,
442449
Stdout: &stdout,
@@ -469,7 +476,7 @@ func TestNVMLDetection(t *testing.T) {
469476
t.Log("Step 6: Running CUDA driver test...")
470477
stdout = outputBuffer{}
471478
stderr = outputBuffer{}
472-
_, err = exec.ExecIntoInstance(execCtx, actualInst.VsockSocket, exec.ExecOptions{
479+
_, err = exec.ExecIntoInstance(execCtx, dialer2, exec.ExecOptions{
473480
Command: []string{"/bin/sh", "-c", "python3 /usr/local/bin/test-cuda.py 2>&1"},
474481
Stdin: nil,
475482
Stdout: &stdout,

lib/exec/README.md

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ API Server (/instances/{id}/exec)
1111
1212
lib/exec/client.go (ExecIntoInstance)
1313
14-
Cloud Hypervisor vsock socket
14+
Hypervisor vsock (CH: Unix socket, QEMU: AF_VSOCK)
1515
1616
Guest: exec-agent (lib/system/exec_agent)
1717
@@ -37,14 +37,13 @@ Container (chroot /overlay/newroot)
3737
"timeout": 30 // optional: timeout in seconds
3838
}
3939
```
40-
- Calls `exec.ExecIntoInstance()` with the instance's vsock socket path
40+
- Creates a `VsockDialer` for the instance's hypervisor type and calls `exec.ExecIntoInstance()`
4141
- Logs audit trail: JWT subject, instance ID, command, start/end time, exit code
4242

4343
### 2. Client (`lib/exec/client.go`)
4444

45-
- **ExecIntoInstance()**: Main client function
46-
- Connects to Cloud Hypervisor's vsock Unix socket
47-
- Performs vsock handshake: `CONNECT 2222\n``OK <cid>`
45+
- **ExecIntoInstance()**: Main client function, takes a `VsockDialer` interface
46+
- Uses hypervisor-specific dialer to connect to guest (see `lib/hypervisor/*/vsock.go`)
4847
- Creates gRPC client over the vsock connection (pooled per VM for efficiency)
4948
- Streams stdin/stdout/stderr bidirectionally
5049
- Returns exit status when command completes

0 commit comments

Comments
 (0)