Skip to content

Commit dad9814

Browse files
committed
libvirt/ssh: Enhance SSH timeouts with time-based retry
Reduce default SSH timeout from 30s to 5s. Implement 60-second time-based retry with 2-second polling intervals. Retry on all errors instead of specific patterns. Suggest 'virsh console' in error messages for debugging. Assisted-by: Claude Code (Sonnet 4.5) Signed-off-by: gursewak1997 <[email protected]>
1 parent 8f8d301 commit dad9814

File tree

2 files changed

+131
-64
lines changed

2 files changed

+131
-64
lines changed

crates/kit/src/libvirt/ssh.rs

Lines changed: 129 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,14 @@ use std::io::Write;
1515
use std::os::unix::fs::PermissionsExt as _;
1616
use std::os::unix::process::CommandExt;
1717
use std::process::Command;
18+
use std::time::{Duration, Instant};
1819
use tempfile;
1920
use tracing::debug;
2021

22+
// SSH retry configuration
23+
const SSH_RETRY_TIMEOUT_SECS: u64 = 60; // Total time to retry SSH connections
24+
const SSH_POLL_DELAY_SECS: u64 = 2; // Delay between SSH attempts
25+
2126
/// Configuration options for SSH connection to libvirt domain
2227
#[derive(Debug, Parser)]
2328
pub struct LibvirtSshOpts {
@@ -36,7 +41,7 @@ pub struct LibvirtSshOpts {
3641
pub strict_host_keys: bool,
3742

3843
/// SSH connection timeout in seconds
39-
#[clap(long, default_value = "30")]
44+
#[clap(long, default_value = "5")]
4045
pub timeout: u32,
4146

4247
/// SSH log level
@@ -236,8 +241,39 @@ impl LibvirtSshOpts {
236241
Ok(temp_key)
237242
}
238243

239-
/// Execute SSH connection to domain
240-
fn connect_ssh(&self, ssh_config: &DomainSshConfig) -> Result<()> {
244+
/// Build SSH command with configured options
245+
fn build_ssh_command(
246+
&self,
247+
ssh_config: &DomainSshConfig,
248+
temp_key: &tempfile::NamedTempFile,
249+
parsed_extra_options: Vec<(String, String)>,
250+
) -> Command {
251+
let mut ssh_cmd = Command::new("ssh");
252+
ssh_cmd
253+
.arg("-i")
254+
.arg(temp_key.path())
255+
.arg("-p")
256+
.arg(ssh_config.ssh_port.to_string());
257+
258+
let common_opts = crate::ssh::CommonSshOptions {
259+
strict_host_keys: self.strict_host_keys,
260+
connect_timeout: self.timeout,
261+
server_alive_interval: 60,
262+
log_level: self.log_level.clone(),
263+
extra_options: parsed_extra_options,
264+
};
265+
common_opts.apply_to_command(&mut ssh_cmd);
266+
ssh_cmd.arg(format!("{}@127.0.0.1", self.user));
267+
268+
ssh_cmd
269+
}
270+
271+
/// Execute SSH connection to domain with retries
272+
fn connect_ssh(
273+
&self,
274+
_global_opts: &crate::libvirt::LibvirtOptions,
275+
ssh_config: &DomainSshConfig,
276+
) -> Result<()> {
241277
debug!(
242278
"Connecting to domain '{}' via SSH on port {} (user: {})",
243279
self.domain_name, ssh_config.ssh_port, self.user
@@ -250,17 +286,7 @@ impl LibvirtSshOpts {
250286
// Create temporary SSH key file
251287
let temp_key = self.create_temp_ssh_key(ssh_config)?;
252288

253-
// Build SSH command
254-
let mut ssh_cmd = Command::new("ssh");
255-
256-
// Add SSH key and port
257-
ssh_cmd
258-
.arg("-i")
259-
.arg(temp_key.path())
260-
.arg("-p")
261-
.arg(ssh_config.ssh_port.to_string());
262-
263-
// Parse extra options from key=value format
289+
// Parse extra options
264290
let mut parsed_extra_options = Vec::new();
265291
for option in &self.extra_options {
266292
if let Some((key, value)) = option.split_once('=') {
@@ -273,76 +299,117 @@ impl LibvirtSshOpts {
273299
}
274300
}
275301

276-
// Apply common SSH options
277-
let common_opts = crate::ssh::CommonSshOptions {
278-
strict_host_keys: self.strict_host_keys,
279-
connect_timeout: self.timeout,
280-
server_alive_interval: 60,
281-
log_level: self.log_level.clone(),
282-
extra_options: parsed_extra_options,
283-
};
284-
common_opts.apply_to_command(&mut ssh_cmd);
302+
let start_time = Instant::now();
303+
let timeout = Duration::from_secs(SSH_RETRY_TIMEOUT_SECS);
285304

286-
// Target host
287-
ssh_cmd.arg(format!("{}@127.0.0.1", self.user));
305+
// For interactive SSH, test connectivity first with retries, then exec
306+
if self.command.is_empty() {
307+
debug!("Testing SSH connectivity before interactive session");
308+
309+
// Retry until timeout
310+
loop {
311+
let mut test_cmd =
312+
self.build_ssh_command(ssh_config, &temp_key, parsed_extra_options.clone());
313+
test_cmd.arg("--").arg("true"); // Simple test command
314+
315+
match test_cmd.output() {
316+
Ok(output) if output.status.success() => {
317+
// SSH is ready, now exec for interactive session
318+
debug!("SSH ready, launching interactive session");
319+
let mut ssh_cmd =
320+
self.build_ssh_command(ssh_config, &temp_key, parsed_extra_options);
321+
let error = ssh_cmd.exec();
322+
return Err(eyre!("Failed to exec SSH command: {}", error));
323+
}
324+
Ok(output) => {
325+
// Check if we've exceeded timeout
326+
if start_time.elapsed() >= timeout {
327+
if !self.suppress_output {
328+
let stderr_str = String::from_utf8_lossy(&output.stderr);
329+
eprint!("{}", stderr_str);
330+
eprintln!(
331+
"\nSSH connection failed after {:.1}s. To see VM console output, run: virsh console {}",
332+
start_time.elapsed().as_secs_f64(),
333+
self.domain_name
334+
);
335+
}
336+
return Err(eyre!("SSH connection failed after timeout"));
337+
}
338+
339+
// Retry all errors - show message on first attempt
340+
if start_time.elapsed().as_secs() < SSH_POLL_DELAY_SECS
341+
&& !self.suppress_output
342+
{
343+
eprintln!("Waiting for SSH to be ready...");
344+
}
345+
346+
std::thread::sleep(Duration::from_secs(SSH_POLL_DELAY_SECS));
347+
}
348+
Err(e) => {
349+
return Err(eyre!("Failed to spawn SSH command: {}", e));
350+
}
351+
}
352+
}
353+
}
354+
355+
// For command execution: retry with timeout
356+
loop {
357+
debug!(
358+
"SSH connection attempt (elapsed: {:.1}s)",
359+
start_time.elapsed().as_secs_f64()
360+
);
288361

289-
// Add command if specified - use the same argument escaping logic as container SSH
290-
if !self.command.is_empty() {
362+
// Build SSH command
363+
let mut ssh_cmd =
364+
self.build_ssh_command(ssh_config, &temp_key, parsed_extra_options.clone());
365+
366+
// Add command
291367
ssh_cmd.arg("--");
292368
if self.command.len() > 1 {
293-
// Multiple arguments need proper shell escaping
294369
let combined_command = crate::ssh::shell_escape_command(&self.command)
295370
.map_err(|e| eyre!("Failed to escape shell command: {}", e))?;
296-
debug!("Combined escaped command: {}", combined_command);
297371
ssh_cmd.arg(combined_command);
298372
} else {
299-
// Single argument can be passed directly
300373
ssh_cmd.args(&self.command);
301374
}
302-
}
303-
304-
debug!("Executing SSH command: {:?}", ssh_cmd);
305-
306-
// For commands (non-interactive SSH), capture output
307-
// For interactive SSH (no command), exec to replace current process
308-
if self.command.is_empty() {
309-
// Interactive SSH - exec to replace the current process
310-
// This provides the cleanest terminal experience
311-
debug!("Executing interactive SSH session via exec");
312375

313-
let error = ssh_cmd.exec();
314-
// exec() only returns on error
315-
return Err(eyre!("Failed to exec SSH command: {}", error));
316-
} else {
317-
// Command execution - capture and forward output
376+
// Try SSH
318377
let output = ssh_cmd
319378
.output()
320379
.map_err(|e| eyre!("Failed to execute SSH command: {}", e))?;
321380

322-
if !output.stdout.is_empty() {
323-
if !self.suppress_output {
324-
// Forward stdout to parent process
381+
if output.status.success() {
382+
if !output.stdout.is_empty() && !self.suppress_output {
325383
print!("{}", String::from_utf8_lossy(&output.stdout));
326384
}
327-
debug!("SSH stdout: {}", String::from_utf8_lossy(&output.stdout));
385+
debug!(
386+
"SSH connected after {:.1}s",
387+
start_time.elapsed().as_secs_f64()
388+
);
389+
return Ok(());
328390
}
329-
if !output.stderr.is_empty() {
391+
392+
// Check if we've exceeded timeout
393+
if start_time.elapsed() >= timeout {
394+
// Timeout - fail with helpful message
330395
if !self.suppress_output {
331-
// Forward stderr to parent process
332-
eprint!("{}", String::from_utf8_lossy(&output.stderr));
396+
let stderr_str = String::from_utf8_lossy(&output.stderr);
397+
eprint!("{}", stderr_str);
398+
eprintln!(
399+
"\nSSH connection failed after {:.1}s. To see VM console output, run: virsh console {}",
400+
start_time.elapsed().as_secs_f64(),
401+
self.domain_name
402+
);
333403
}
334-
debug!("SSH stderr: {}", String::from_utf8_lossy(&output.stderr));
335-
}
336-
337-
if !output.status.success() {
338404
return Err(eyre!(
339-
"SSH connection failed with exit code: {}",
340-
output.status.code().unwrap_or(-1)
405+
"SSH connection failed after {:.1}s",
406+
start_time.elapsed().as_secs_f64()
341407
));
342408
}
343-
}
344409

345-
Ok(())
410+
// Retry all errors
411+
std::thread::sleep(Duration::from_secs(SSH_POLL_DELAY_SECS));
412+
}
346413
}
347414
}
348415

@@ -377,8 +444,8 @@ pub fn run_ssh_impl(
377444
// Extract SSH configuration from domain metadata
378445
let ssh_config = opts.extract_ssh_config(global_opts)?;
379446

380-
// Connect via SSH
381-
opts.connect_ssh(&ssh_config)?;
447+
// Connect via SSH with retries
448+
opts.connect_ssh(global_opts, &ssh_config)?;
382449

383450
Ok(())
384451
}

crates/kit/src/ssh.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ impl Default for CommonSshOptions {
220220
fn default() -> Self {
221221
Self {
222222
strict_host_keys: false,
223-
connect_timeout: 30,
223+
connect_timeout: 5,
224224
server_alive_interval: 60,
225225
log_level: "ERROR".to_string(),
226226
extra_options: vec![],
@@ -363,7 +363,7 @@ mod tests {
363363
fn test_ssh_connection_options() {
364364
// Test default options
365365
let default_opts = SshConnectionOptions::default();
366-
assert_eq!(default_opts.common.connect_timeout, 30);
366+
assert_eq!(default_opts.common.connect_timeout, 5);
367367
assert!(default_opts.allocate_tty);
368368
assert_eq!(default_opts.common.log_level, "ERROR");
369369
assert!(default_opts.common.extra_options.is_empty());

0 commit comments

Comments
 (0)