58 changes: 52 additions & 6 deletions cmd/api/api/api_test.go
@@ -6,14 +6,17 @@ import (
"os"
"syscall"
"testing"
"time"

"github.com/onkernel/hypeman/cmd/api/config"
"github.com/onkernel/hypeman/lib/images"
"github.com/onkernel/hypeman/lib/instances"
"github.com/onkernel/hypeman/lib/network"
"github.com/onkernel/hypeman/lib/oapi"
"github.com/onkernel/hypeman/lib/paths"
"github.com/onkernel/hypeman/lib/system"
"github.com/onkernel/hypeman/lib/volumes"
"github.com/stretchr/testify/require"
)

// newTestService creates an ApiService for testing with automatic cleanup
@@ -51,35 +54,35 @@ func newTestService(t *testing.T) *ApiService {
func cleanupOrphanedProcesses(t *testing.T, dataDir string) {
p := paths.New(dataDir)
guestsDir := p.GuestsDir()

entries, err := os.ReadDir(guestsDir)
if err != nil {
return // No guests directory
}

for _, entry := range entries {
if !entry.IsDir() {
continue
}

metaPath := p.InstanceMetadata(entry.Name())
data, err := os.ReadFile(metaPath)
if err != nil {
continue
}

// Parse just the CHPID field
var meta struct {
CHPID *int `json:"CHPID"`
}
if err := json.Unmarshal(data, &meta); err != nil {
continue
}

// If metadata has a PID, try to kill it
if meta.CHPID != nil {
pid := *meta.CHPID

// Check if process exists
if err := syscall.Kill(pid, 0); err == nil {
t.Logf("Cleaning up orphaned Cloud Hypervisor process: PID %d", pid)
@@ -92,3 +95,46 @@ func cleanupOrphanedProcesses(t *testing.T, dataDir string) {
func ctx() context.Context {
return context.Background()
}

// createAndWaitForImage creates an image and waits for it to be ready.
// Returns the image name on success, or fails the test on error/timeout.
func createAndWaitForImage(t *testing.T, svc *ApiService, imageName string, timeout time.Duration) string {
t.Helper()

t.Logf("Creating image %s...", imageName)
imgResp, err := svc.CreateImage(ctx(), oapi.CreateImageRequestObject{
Body: &oapi.CreateImageRequest{
Name: imageName,
},
})
require.NoError(t, err)

imgCreated, ok := imgResp.(oapi.CreateImage202JSONResponse)
require.True(t, ok, "expected 202 response for image creation")

t.Log("Waiting for image to be ready...")
deadline := time.Now().Add(timeout)
for time.Now().Before(deadline) {
imgResp, err := svc.GetImage(ctx(), oapi.GetImageRequestObject{
Name: imageName,
})
require.NoError(t, err)

img, ok := imgResp.(oapi.GetImage200JSONResponse)
if ok {
switch img.Status {
case "ready":
t.Log("Image is ready")
return imgCreated.Name
case "failed":
t.Fatalf("Image build failed: %v", img.Error)
default:
t.Logf("Image status: %s", img.Status)
}
}
time.Sleep(1 * time.Second)
}

t.Fatalf("Timeout waiting for image %s to be ready", imageName)
return ""
}
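
For reference, a minimal usage sketch of the new helper from another integration test in the same package (the test name and image tag below are illustrative assumptions, not code from this PR):

func TestImageHelperUsage(t *testing.T) {
	if testing.Short() {
		t.Skip("Skipping integration test in short mode")
	}
	svc := newTestService(t)

	// Pull a small image and block until GetImage reports "ready" or the 30s deadline passes.
	name := createAndWaitForImage(t, svc, "docker.io/library/alpine:3.20", 30*time.Second)
	t.Logf("Image %s is ready", name)
}
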
200 changes: 147 additions & 53 deletions cmd/api/api/exec_test.go
@@ -34,58 +34,20 @@ func TestExecInstanceNonTTY(t *testing.T) {
require.NoError(t, err)
t.Log("System files ready")

// First, create and wait for the image to be ready
// Use nginx which has a proper long-running process
t.Log("Creating nginx:alpine image...")
imgResp, err := svc.CreateImage(ctx(), oapi.CreateImageRequestObject{
Body: &oapi.CreateImageRequest{
Name: "docker.io/library/nginx:alpine",
},
})
require.NoError(t, err)
imgCreated, ok := imgResp.(oapi.CreateImage202JSONResponse)
require.True(t, ok, "expected 202 response")
assert.Equal(t, "docker.io/library/nginx:alpine", imgCreated.Name)

// Wait for image to be ready (poll with timeout)
t.Log("Waiting for image to be ready...")
timeout := time.After(30 * time.Second)
ticker := time.NewTicker(1 * time.Second)
defer ticker.Stop()

imageReady := false
for !imageReady {
select {
case <-timeout:
t.Fatal("Timeout waiting for image to be ready")
case <-ticker.C:
imgResp, err := svc.GetImage(ctx(), oapi.GetImageRequestObject{
Name: "docker.io/library/nginx:alpine",
})
require.NoError(t, err)

img, ok := imgResp.(oapi.GetImage200JSONResponse)
if ok && img.Status == "ready" {
imageReady = true
t.Log("Image is ready")
} else if ok {
t.Logf("Image status: %s", img.Status)
}
}
}
// Create and wait for nginx image (has a proper long-running process)
createAndWaitForImage(t, svc, "docker.io/library/nginx:alpine", 30*time.Second)

// Create instance
t.Log("Creating instance...")
networkDisabled := false
networkEnabled := false
instResp, err := svc.CreateInstance(ctx(), oapi.CreateInstanceRequestObject{
Body: &oapi.CreateInstanceRequest{
Name: "exec-test",
Image: "docker.io/library/nginx:alpine",
Network: &struct {
Enabled *bool `json:"enabled,omitempty"`
Name *string `json:"name,omitempty"`
Enabled *bool `json:"enabled,omitempty"`
}{
Enabled: &networkDisabled,
Enabled: &networkEnabled,
},
},
})
@@ -108,8 +70,8 @@ func TestExecInstanceNonTTY(t *testing.T) {
case <-nginxTimeout:
t.Fatal("Timeout waiting for nginx to start")
case <-nginxTicker.C:
logs, err := svc.InstanceManager.GetInstanceLogs(ctx(), inst.Id, false, 100)
if err == nil && strings.Contains(logs, "start worker processes") {
logs := collectTestLogs(t, svc, inst.Id, 100)
if strings.Contains(logs, "start worker processes") {
nginxReady = true
t.Log("Nginx is ready")
}
@@ -132,7 +94,7 @@ func TestExecInstanceNonTTY(t *testing.T) {
consolePath := paths.New(svc.Config.DataDir).InstanceConsoleLog(inst.Id)
if consoleData, err := os.ReadFile(consolePath); err == nil {
lines := strings.Split(string(consoleData), "\n")

// Print exec-agent specific logs
t.Logf("=== Exec Agent Logs ===")
for _, line := range lines {
@@ -155,40 +117,39 @@ func TestExecInstanceNonTTY(t *testing.T) {
var exit *exec.ExitStatus
var stdout, stderr outputBuffer
var execErr error

t.Log("Testing exec command: whoami")
maxRetries := 10
for i := 0; i < maxRetries; i++ {
stdout = outputBuffer{}
stderr = outputBuffer{}

exit, execErr = exec.ExecIntoInstance(ctx(), actualInst.VsockSocket, exec.ExecOptions{
Command: []string{"/bin/sh", "-c", "whoami"},
Stdin: nil,
Stdout: &stdout,
Stderr: &stderr,
TTY: false,
})

if execErr == nil {
break
}

t.Logf("Exec attempt %d/%d failed, retrying: %v", i+1, maxRetries, execErr)
time.Sleep(1 * time.Second)
}

// Assert exec worked
require.NoError(t, execErr, "exec should succeed after retries")
require.NotNil(t, exit, "exit status should be returned")
require.Equal(t, 0, exit.Code, "whoami should exit with code 0")


// Verify output
outStr := stdout.String()
t.Logf("Command output: %q", outStr)
require.Contains(t, outStr, "root", "whoami should return root user")

// Cleanup
t.Log("Cleaning up instance...")
delResp, err := svc.DeleteInstance(ctx(), oapi.DeleteInstanceRequestObject{
@@ -199,6 +160,139 @@ func TestExecInstanceNonTTY(t *testing.T) {
require.True(t, ok, "expected 204 response")
}
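
TestExecInstanceNonTTY drives the agent with TTY set to false. For contrast, an interactive invocation would presumably differ only in how the streams and the TTY flag are wired up; the sketch below uses only the ExecOptions fields visible in this diff and is not code from the PR (any raw-terminal handling the exec package may need is omitted):

// Hypothetical TTY-mode exec (sketch; assumes the same ExecOptions fields shown above):
exit, err := exec.ExecIntoInstance(ctx(), actualInst.VsockSocket, exec.ExecOptions{
	Command: []string{"/bin/sh"},
	Stdin:   os.Stdin,
	Stdout:  os.Stdout,
	Stderr:  os.Stderr,
	TTY:     true,
})
require.NoError(t, err)
t.Logf("shell exited with code %d", exit.Code)
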

// TestExecWithDebianMinimal tests exec with a minimal Debian image.
// This test specifically catches issues that wouldn't appear with Alpine-based images:
// 1. Debian's default entrypoint (bash) exits immediately without a TTY
// 2. exec-agent must keep running even after the main app exits
// 3. The VM must not kernel panic when the entrypoint exits
func TestExecWithDebianMinimal(t *testing.T) {
// Require KVM access for VM creation
if _, err := os.Stat("/dev/kvm"); os.IsNotExist(err) {
t.Fatal("/dev/kvm not available - ensure KVM is enabled and user is in 'kvm' group (sudo usermod -aG kvm $USER)")
}

if testing.Short() {
t.Skip("Skipping integration test in short mode")
}

svc := newTestService(t)

// Ensure system files (kernel and initrd) are available
t.Log("Ensuring system files...")
systemMgr := system.NewManager(paths.New(svc.Config.DataDir))
err := systemMgr.EnsureSystemFiles(ctx())
require.NoError(t, err)
t.Log("System files ready")

// Create Debian 12 slim image (minimal, no iproute2)
createAndWaitForImage(t, svc, "docker.io/library/debian:12-slim", 60*time.Second)

// Create instance (network disabled in test environment)
t.Log("Creating Debian instance...")
networkEnabled := false
instResp, err := svc.CreateInstance(ctx(), oapi.CreateInstanceRequestObject{
Body: &oapi.CreateInstanceRequest{
Name: "debian-exec-test",
Image: "docker.io/library/debian:12-slim",
Network: &struct {
Enabled *bool `json:"enabled,omitempty"`
}{
Enabled: &networkEnabled,
},
},
})
require.NoError(t, err)

inst, ok := instResp.(oapi.CreateInstance201JSONResponse)
require.True(t, ok, "expected 201 response")
require.NotEmpty(t, inst.Id)
t.Logf("Instance created: %s", inst.Id)

// Cleanup on exit
t.Cleanup(func() {
t.Log("Cleaning up instance...")
svc.DeleteInstance(ctx(), oapi.DeleteInstanceRequestObject{Id: inst.Id})
})

// Get actual instance to access vsock fields
actualInst, err := svc.InstanceManager.GetInstance(ctx(), inst.Id)
require.NoError(t, err)
require.NotNil(t, actualInst)

// Wait for exec-agent to be ready by checking logs
// This is the key difference: we wait for exec-agent, not the app (which exits immediately)
t.Log("Waiting for exec-agent to start...")
execAgentReady := false
agentTimeout := time.After(15 * time.Second)
agentTicker := time.NewTicker(500 * time.Millisecond)
defer agentTicker.Stop()

var logs string
for !execAgentReady {
select {
case <-agentTimeout:
// Dump logs on failure for debugging
logs = collectTestLogs(t, svc, inst.Id, 200)
t.Logf("Console logs:\n%s", logs)
t.Fatal("Timeout waiting for exec-agent to start")
case <-agentTicker.C:
logs = collectTestLogs(t, svc, inst.Id, 100)
if strings.Contains(logs, "[exec-agent] listening on vsock port 2222") {
execAgentReady = true
t.Log("exec-agent is ready")
}
}
}

// Verify the app exited but VM is still usable (key behavior this test validates)
logs = collectTestLogs(t, svc, inst.Id, 200)
assert.Contains(t, logs, "overlay-init: app exited with code", "App should have exited")

// Test exec commands work even though the main app (bash) has exited
t.Log("Testing exec command: echo")
var stdout, stderr outputBuffer
exit, err := exec.ExecIntoInstance(ctx(), actualInst.VsockSocket, exec.ExecOptions{
Command: []string{"echo", "hello from debian"},
Stdout: &stdout,
Stderr: &stderr,
TTY: false,
})
require.NoError(t, err, "exec should succeed")
require.NotNil(t, exit)
require.Equal(t, 0, exit.Code, "echo should exit with code 0")
assert.Contains(t, stdout.String(), "hello from debian")

// Verify we're actually in Debian
t.Log("Verifying OS release...")
stdout = outputBuffer{}
exit, err = exec.ExecIntoInstance(ctx(), actualInst.VsockSocket, exec.ExecOptions{
Command: []string{"cat", "/etc/os-release"},
Stdout: &stdout,
TTY: false,
})
require.NoError(t, err)
require.Equal(t, 0, exit.Code)
assert.Contains(t, stdout.String(), "Debian", "Should be running Debian")
assert.Contains(t, stdout.String(), "bookworm", "Should be Debian 12 (bookworm)")
t.Logf("OS: %s", strings.Split(stdout.String(), "\n")[0])

}

// collectTestLogs collects logs from an instance (non-streaming)
func collectTestLogs(t *testing.T, svc *ApiService, instanceID string, n int) string {
logChan, err := svc.InstanceManager.StreamInstanceLogs(ctx(), instanceID, n, false)
if err != nil {
return ""
}

var lines []string
for line := range logChan {
lines = append(lines, line)
}

return strings.Join(lines, "\n")
}

// outputBuffer is a simple buffer for capturing exec output
type outputBuffer struct {
buf bytes.Buffer
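
The rest of outputBuffer is collapsed in this diff. A plausible minimal completion, using only the buf field shown above, would look like this (hypothetical sketch; the PR's actual methods may differ, e.g. by adding locking for concurrent writes):

// Hypothetical sketch of the collapsed methods (not the PR's actual code):
func (b *outputBuffer) Write(p []byte) (int, error) { return b.buf.Write(p) }
func (b *outputBuffer) String() string              { return b.buf.String() }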