Skip to content

Commit 1d67894

Browse files
committed
tweaks
1 parent bb45dcb commit 1d67894

File tree

8 files changed

+101
-87
lines changed

8 files changed

+101
-87
lines changed

.cursor/plans/systemd_vm_with_go_init_c0a9c010.plan.md

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@ flowchart TB
9292
end
9393
```
9494

95+
96+
9597
## Shared vs Mode-Specific Behavior
9698

9799
| Step | Exec Mode | Systemd Mode |
|------|-----------|--------------|
| Mount proc/sys/dev | Shared | Shared |
| Mount rootfs overlay | Shared | Shared |
| Read config disk | Shared | Shared |
| Configure network | Init configures it | Init configures it (before pivot) |
| Load GPU drivers | Shared | Shared |
| Mount volumes | Shared | Shared |
| Copy guest-agent | To `/opt/hypeman/` | To `/opt/hypeman/` |
| Start guest-agent | Background process | Systemd service |
| PID 1 | Go init binary | Systemd |
| App lifecycle | Managed by init | Managed by systemd |
@@ -143,6 +145,8 @@ func (l *Logger) Error(phase, msg string, err error) {
143145
// 2024-12-23T10:15:32Z [INFO] [systemd] exec /sbin/init
144146
```
145147

148+
149+
146150
## Go-based Init Binary
147151

148152
Package structure at `lib/system/init/`:
@@ -160,6 +164,8 @@ lib/system/init/
160164
logger.go # Human-readable logging to hypeman operations log
161165
```
162166

167+
168+
163169
### Main Orchestration
164170

165171
```go
@@ -216,6 +222,8 @@ func main() {
216222
}
217223
```
218224

225+
226+
219227
### Systemd Mode
220228

221229
```go
@@ -277,6 +285,8 @@ WantedBy=multi-user.target
277285
}
278286
```
279287

288+
289+
280290
## Detection Logic
281291

282292
Auto-detect systemd mode by inspecting the image's CMD. No override flag - if CMD is a systemd init, always use systemd mode.
@@ -320,6 +330,8 @@ func IsSystemdImage(entrypoint, cmd []string) bool {
320330
}
321331
```
322332

333+
334+
323335
## E2E Test
324336

325337
Custom Dockerfile in repository at `integration/testdata/systemd/Dockerfile`:
@@ -400,7 +412,7 @@ func TestExecModeUnchanged(t *testing.T) {
400412
result = execInVM(t, inst, "cat", "/proc/1/comm")
401413
assert.Equal(t, "init", strings.TrimSpace(result.Stdout))
402414
}
403-
```
404415

405416

406-
## Files to Modify/Create
417+
418+
```

Makefile

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
SHELL := /bin/bash
2-
.PHONY: oapi-generate generate-vmm-client generate-wire generate-all dev build test install-tools gen-jwt download-ch-binaries download-ch-spec ensure-ch-binaries build-caddy-binaries build-caddy ensure-caddy-binaries build-preview-cli release-prep clean
2+
.PHONY: oapi-generate generate-vmm-client generate-wire generate-all dev build test install-tools gen-jwt download-ch-binaries download-ch-spec ensure-ch-binaries build-caddy-binaries build-caddy ensure-caddy-binaries release-prep clean build-embedded
33

44
# Directory where local binaries will be installed
55
BIN_DIR ?= $(CURDIR)/bin
@@ -176,7 +176,6 @@ lib/system/init/init: lib/system/init/*.go
176176
cd lib/system/init && CGO_ENABLED=0 go build -ldflags="-s -w" -o init .
177177

178178
# Build all embedded binaries
179-
.PHONY: build-embedded
180179
build-embedded: lib/system/guest_agent/guest-agent lib/system/init/init
181180

182181
# Build the binary

cmd/api/api/exec.go

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,12 @@ var upgrader = websocket.Upgrader{
2929

3030
// ExecRequest represents the JSON body for exec requests
3131
type ExecRequest struct {
32-
Command []string `json:"command"`
33-
TTY bool `json:"tty"`
34-
Env map[string]string `json:"env,omitempty"`
35-
Cwd string `json:"cwd,omitempty"`
36-
Timeout int32 `json:"timeout,omitempty"` // seconds
32+
Command []string `json:"command"`
33+
TTY bool `json:"tty"`
34+
Env map[string]string `json:"env,omitempty"`
35+
Cwd string `json:"cwd,omitempty"`
36+
Timeout int32 `json:"timeout,omitempty"` // seconds
37+
WaitForAgent int32 `json:"wait_for_agent,omitempty"` // seconds to wait for guest agent to be ready
3738
}
3839

3940
// ExecHandler handles exec requests via WebSocket for bidirectional streaming
@@ -106,6 +107,7 @@ func (s *ApiService) ExecHandler(w http.ResponseWriter, r *http.Request) {
106107
"tty", execReq.TTY,
107108
"cwd", execReq.Cwd,
108109
"timeout", execReq.Timeout,
110+
"wait_for_agent", execReq.WaitForAgent,
109111
)
110112

111113
// Create WebSocket read/writer wrapper
@@ -122,14 +124,15 @@ func (s *ApiService) ExecHandler(w http.ResponseWriter, r *http.Request) {
122124

123125
// Execute via vsock
124126
exit, err := guest.ExecIntoInstance(ctx, dialer, guest.ExecOptions{
125-
Command: execReq.Command,
126-
Stdin: wsConn,
127-
Stdout: wsConn,
128-
Stderr: wsConn,
129-
TTY: execReq.TTY,
130-
Env: execReq.Env,
131-
Cwd: execReq.Cwd,
132-
Timeout: execReq.Timeout,
127+
Command: execReq.Command,
128+
Stdin: wsConn,
129+
Stdout: wsConn,
130+
Stderr: wsConn,
131+
TTY: execReq.TTY,
132+
Env: execReq.Env,
133+
Cwd: execReq.Cwd,
134+
Timeout: execReq.Timeout,
135+
WaitForAgent: time.Duration(execReq.WaitForAgent) * time.Second,
133136
})
134137

135138
duration := time.Since(startTime)

cmd/api/api/exec_test.go

Lines changed: 10 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -115,38 +115,24 @@ func TestExecInstanceNonTTY(t *testing.T) {
115115
t.Logf("vsock socket exists: %s", actualInst.VsockSocket)
116116
}
117117

118-
// Wait for exec agent to be ready (retry a few times)
119-
var exit *guest.ExitStatus
118+
// Wait for exec agent to be ready using WaitForAgent
120119
var stdout, stderr outputBuffer
121-
var execErr error
122120

123121
dialer, err := hypervisor.NewVsockDialer(actualInst.HypervisorType, actualInst.VsockSocket, actualInst.VsockCID)
124122
require.NoError(t, err)
125123

126124
t.Log("Testing exec command: whoami")
127-
maxRetries := 10
128-
for i := 0; i < maxRetries; i++ {
129-
stdout = outputBuffer{}
130-
stderr = outputBuffer{}
131-
132-
exit, execErr = guest.ExecIntoInstance(ctx(), dialer, guest.ExecOptions{
133-
Command: []string{"/bin/sh", "-c", "whoami"},
134-
Stdin: nil,
135-
Stdout: &stdout,
136-
Stderr: &stderr,
137-
TTY: false,
138-
})
139-
140-
if execErr == nil {
141-
break
142-
}
143-
144-
t.Logf("Exec attempt %d/%d failed, retrying: %v", i+1, maxRetries, execErr)
145-
time.Sleep(1 * time.Second)
146-
}
125+
exit, execErr := guest.ExecIntoInstance(ctx(), dialer, guest.ExecOptions{
126+
Command: []string{"/bin/sh", "-c", "whoami"},
127+
Stdin: nil,
128+
Stdout: &stdout,
129+
Stderr: &stderr,
130+
TTY: false,
131+
WaitForAgent: 10 * time.Second, // Wait up to 10s for guest agent to be ready
132+
})
147133

148134
// Assert exec worked
149-
require.NoError(t, execErr, "exec should succeed after retries")
135+
require.NoError(t, execErr, "exec should succeed")
150136
require.NotNil(t, exit, "exit status should be returned")
151137
require.Equal(t, 0, exit.Code, "whoami should exit with code 0")
152138

integration/systemd_test.go

Lines changed: 17 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -170,35 +170,25 @@ func TestSystemdMode(t *testing.T) {
170170

171171
// waitForGuestAgent blocks until the guest agent answers a trivial exec or the timeout elapses; retries are delegated to ExecOptions.WaitForAgent.
172172
func waitForGuestAgent(ctx context.Context, mgr instances.Manager, instanceID string, timeout time.Duration) error {
173-
deadline := time.Now().Add(timeout)
174-
for time.Now().Before(deadline) {
175-
inst, err := mgr.GetInstance(ctx, instanceID)
176-
if err != nil {
177-
time.Sleep(500 * time.Millisecond)
178-
continue
179-
}
180-
181-
// Try to connect to the guest agent
182-
dialer, err := hypervisor.NewVsockDialer(inst.HypervisorType, inst.VsockSocket, inst.VsockCID)
183-
if err != nil {
184-
time.Sleep(500 * time.Millisecond)
185-
continue
186-
}
187-
188-
// Try a simple exec to verify agent is responding
189-
var stdout bytes.Buffer
190-
_, err = guest.ExecIntoInstance(ctx, dialer, guest.ExecOptions{
191-
Command: []string{"echo", "ready"},
192-
Stdout: &stdout,
193-
TTY: false,
194-
})
195-
if err == nil {
196-
return nil
197-
}
173+
inst, err := mgr.GetInstance(ctx, instanceID)
174+
if err != nil {
175+
return err
176+
}
198177

199-
time.Sleep(500 * time.Millisecond)
178+
dialer, err := hypervisor.NewVsockDialer(inst.HypervisorType, inst.VsockSocket, inst.VsockCID)
179+
if err != nil {
180+
return err
200181
}
201-
return context.DeadlineExceeded
182+
183+
// Use WaitForAgent to wait for the agent to be ready
184+
var stdout bytes.Buffer
185+
_, err = guest.ExecIntoInstance(ctx, dialer, guest.ExecOptions{
186+
Command: []string{"echo", "ready"},
187+
Stdout: &stdout,
188+
TTY: false,
189+
WaitForAgent: timeout,
190+
})
191+
return err
202192
}
203193

204194
// execInInstance executes a command in the instance

lib/guest/client.go

Lines changed: 43 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ const (
3030
// This can happen if:
3131
// - The VM is still booting
3232
// - The guest agent was stopped or deleted
33-
// - The VM is in systemd mode and the agent service failed to start
3433
type AgentConnectionError struct {
3534
Err error
3635
}
@@ -43,14 +42,6 @@ func (e *AgentConnectionError) Unwrap() error {
4342
return e.Err
4443
}
4544

46-
// IsAgentConnectionError checks if an error is due to the guest agent not responding.
47-
func IsAgentConnectionError(err error) bool {
48-
var agentErr *AgentConnectionError
49-
return err != nil && (strings.Contains(err.Error(), "guest agent not responding") ||
50-
strings.Contains(err.Error(), "connection refused") ||
51-
errors.As(err, &agentErr))
52-
}
53-
5445
// connPool manages reusable gRPC connections per vsock dialer key
5546
// This avoids the overhead and potential issues of rapidly creating/closing connections
5647
var connPool = struct {
@@ -122,19 +113,54 @@ type ExitStatus struct {
122113

123114
// ExecOptions configures command execution
124115
type ExecOptions struct {
125-
Command []string
126-
Stdin io.Reader
127-
Stdout io.Writer
128-
Stderr io.Writer
129-
TTY bool
130-
Env map[string]string // Environment variables
131-
Cwd string // Working directory (optional)
132-
Timeout int32 // Execution timeout in seconds (0 = no timeout)
116+
Command []string
117+
Stdin io.Reader
118+
Stdout io.Writer
119+
Stderr io.Writer
120+
TTY bool
121+
Env map[string]string // Environment variables
122+
Cwd string // Working directory (optional)
123+
Timeout int32 // Execution timeout in seconds (0 = no timeout)
124+
WaitForAgent time.Duration // Max time to wait for agent to be ready (0 = no wait, fail immediately)
133125
}
134126

135127
// ExecIntoInstance executes command in instance via vsock using gRPC.
136128
// The dialer is a hypervisor-specific VsockDialer that knows how to connect to the guest.
129+
// If opts.WaitForAgent is non-zero, an AgentConnectionError is retried every 500ms until that duration elapses or ctx is cancelled; any other result is returned immediately.
137130
func ExecIntoInstance(ctx context.Context, dialer hypervisor.VsockDialer, opts ExecOptions) (*ExitStatus, error) {
131+
// If no wait requested, execute immediately
132+
if opts.WaitForAgent == 0 {
133+
return execIntoInstanceOnce(ctx, dialer, opts)
134+
}
135+
136+
deadline := time.Now().Add(opts.WaitForAgent)
137+
138+
for {
139+
exit, err := execIntoInstanceOnce(ctx, dialer, opts)
140+
141+
// Success or non-connection error - return immediately
142+
var connErr *AgentConnectionError
143+
if err == nil || !errors.As(err, &connErr) {
144+
return exit, err
145+
}
146+
147+
// Connection error - check if we should retry
148+
if time.Now().After(deadline) {
149+
return nil, err
150+
}
151+
152+
// Wait before retrying, but respect context cancellation
153+
select {
154+
case <-ctx.Done():
155+
return nil, ctx.Err()
156+
case <-time.After(500 * time.Millisecond):
157+
// Continue to retry
158+
}
159+
}
160+
}
161+
162+
// execIntoInstanceOnce executes command in instance via vsock using gRPC (single attempt).
163+
func execIntoInstanceOnce(ctx context.Context, dialer hypervisor.VsockDialer, opts ExecOptions) (*ExitStatus, error) {
138164
start := time.Now()
139165
var bytesSent int64
140166

lib/system/init/mode_exec.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,4 +97,3 @@ func buildEnv(env map[string]string) []string {
9797

9898
return result
9999
}
100-

lib/system/init/network.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,4 +55,3 @@ func runIP(args ...string) error {
5555
}
5656
return nil
5757
}
58-

0 commit comments

Comments
 (0)