@@ -30,7 +30,6 @@ const (
3030// This can happen if:
3131// - The VM is still booting
3232// - The guest agent was stopped or deleted
33- // - The VM is in systemd mode and the agent service failed to start
3433type AgentConnectionError struct {
3534 Err error
3635}
@@ -43,14 +42,6 @@ func (e *AgentConnectionError) Unwrap() error {
4342 return e .Err
4443}
4544
46- // IsAgentConnectionError checks if an error is due to the guest agent not responding.
47- func IsAgentConnectionError (err error ) bool {
48- var agentErr * AgentConnectionError
49- return err != nil && (strings .Contains (err .Error (), "guest agent not responding" ) ||
50- strings .Contains (err .Error (), "connection refused" ) ||
51- errors .As (err , & agentErr ))
52- }
53-
5445// connPool manages reusable gRPC connections per vsock dialer key
5546// This avoids the overhead and potential issues of rapidly creating/closing connections
5647var connPool = struct {
@@ -122,19 +113,54 @@ type ExitStatus struct {
122113
123114// ExecOptions configures command execution
124115type ExecOptions struct {
125- Command []string
126- Stdin io.Reader
127- Stdout io.Writer
128- Stderr io.Writer
129- TTY bool
130- Env map [string ]string // Environment variables
131- Cwd string // Working directory (optional)
132- Timeout int32 // Execution timeout in seconds (0 = no timeout)
116+ Command []string
117+ Stdin io.Reader
118+ Stdout io.Writer
119+ Stderr io.Writer
120+ TTY bool
121+ Env map [string ]string // Environment variables
122+ Cwd string // Working directory (optional)
123+ Timeout int32 // Execution timeout in seconds (0 = no timeout)
124+ WaitForAgent time.Duration // Max time to wait for agent to be ready (0 = no wait, fail immediately)
133125}
134126
135127// ExecIntoInstance executes command in instance via vsock using gRPC.
136128// The dialer is a hypervisor-specific VsockDialer that knows how to connect to the guest.
129+ // If WaitForAgent is set, it will retry on AgentConnectionError until the timeout.
137130func ExecIntoInstance (ctx context.Context , dialer hypervisor.VsockDialer , opts ExecOptions ) (* ExitStatus , error ) {
131+ // If no wait requested, execute immediately
132+ if opts .WaitForAgent == 0 {
133+ return execIntoInstanceOnce (ctx , dialer , opts )
134+ }
135+
136+ deadline := time .Now ().Add (opts .WaitForAgent )
137+
138+ for {
139+ exit , err := execIntoInstanceOnce (ctx , dialer , opts )
140+
141+ // Success or non-connection error - return immediately
142+ var connErr * AgentConnectionError
143+ if err == nil || ! errors .As (err , & connErr ) {
144+ return exit , err
145+ }
146+
147+ // Connection error - check if we should retry
148+ if time .Now ().After (deadline ) {
149+ return nil , err
150+ }
151+
152+ // Wait before retrying, but respect context cancellation
153+ select {
154+ case <- ctx .Done ():
155+ return nil , ctx .Err ()
156+ case <- time .After (500 * time .Millisecond ):
157+ // Continue to retry
158+ }
159+ }
160+ }
161+
162+ // execIntoInstanceOnce executes command in instance via vsock using gRPC (single attempt).
163+ func execIntoInstanceOnce (ctx context.Context , dialer hypervisor.VsockDialer , opts ExecOptions ) (* ExitStatus , error ) {
138164 start := time .Now ()
139165 var bytesSent int64
140166
0 commit comments