From 36d5bed8e9dd5c3ff4db2cad1b5cb6803db592d1 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Mon, 14 Oct 2024 17:18:54 +0100 Subject: [PATCH] Combine SSHConnection._init_connection and Microvm.wait_for_ssh_up We use `Microvm.wait_for_ssh_up` to wait for a booted guest's userland to be initialized enough that it can process incoming SSH connections. However, it turns out that we werent waiting the intended 10s, we were waiting 3s. The reason for this is that before the `true` command in `wait_for_ssh_up` is even sent to the guest, we go through the `Microvm.ssh` property, which calls the `Microvm.ssh_face` function, which constructs an `SSHConnection` object, whose construtor calls `SSHConnection._init_connection`. And inside `_init_connection`, we _already_ try to send a `true` command to the guest, to ascertain it is up and running. However, here we do it in a retry loop (needed because if port 22 is still closed, `ssh` exits with non-zero code and "connection refuses") that does 20 retries at 0.15s intervals. Or tries to 3 seconds. Most importantly, the retries in `_init_connection` are done without timeouts, so even _if_ we actually hung during connection establishment, `wait_for_ssh_up` wouldn't abort after 10 seconds, because it would get stuck inside of `_init_connection`, which doesn't know anything about a 10s timeout. Fix all this up by merging `_init_connection` and `wait_for_ssh_up`. Skip sending the `true` command in `wait_for_ssh_up`, and just call `ssh_iface(0)` directly. Increment the retry time in `_init_connection` to actually be 10s. And add a 10s timeout to the commands in `_init_connection`. Signed-off-by: Patrick Roy --- tests/framework/microvm.py | 22 ++++++------------- tests/host_tools/network.py | 4 ++-- .../functional/test_pause_resume.py | 8 +++---- 3 files changed, 13 insertions(+), 21 deletions(-) diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index ff79c8f2f00..7a0b57cf610 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -18,7 +18,6 @@ import select import shutil import signal -import subprocess import time import uuid from collections import namedtuple @@ -1003,26 +1002,19 @@ def thread_backtraces(self): ) return "\n".join(backtraces) - def wait_for_ssh_up(self, timeout=10): - """Wait for guest running inside the microVM to come up and respond. - - :param timeout: seconds to wait. - """ + def wait_for_ssh_up(self): + """Wait for guest running inside the microVM to come up and respond.""" try: - rc, stdout, stderr = self.ssh.run("true", timeout) - except subprocess.TimeoutExpired: + # Ensure that we have an initialized SSH connection to the guest that can + # run commands. The actual connection retry loop happens in SSHConnection._init_connection + self.ssh_iface(0) + except Exception as exc: print( - f"Remote command did not respond within {timeout}s\n\n" + f"Failed to establish SSH connection to guest: {exc}\n\n" f"Firecracker logs:\n{self.log_data}\n" f"Thread backtraces:\n{self.thread_backtraces}" ) raise - assert rc == 0, ( - f"Remote command exited with non-0 status code\n\n" - f"{rc=}\n{stdout=}\n{stderr=}\n\n" - f"Firecracker logs:\n{self.log_data}\n" - f"Thread backtraces:\n{self.thread_backtraces}" - ) class MicroVMFactory: diff --git a/tests/host_tools/network.py b/tests/host_tools/network.py index ade81738626..316fdec5424 100644 --- a/tests/host_tools/network.py +++ b/tests/host_tools/network.py @@ -78,7 +78,7 @@ def scp_get(self, remote_path, local_path, recursive=False): @retry( retry=retry_if_exception_type(ChildProcessError), - wait=wait_fixed(0.15), + wait=wait_fixed(0.5), stop=stop_after_attempt(20), reraise=True, ) @@ -90,7 +90,7 @@ def _init_connection(self): We'll keep trying to execute a remote command that can't fail (`/bin/true`), until we get a successful (0) exit code. """ - self.check_output("true") + self.check_output("true", timeout=10) def run(self, cmd_string, timeout=None, *, check=False): """Execute the command passed as a string in the ssh context.""" diff --git a/tests/integration_tests/functional/test_pause_resume.py b/tests/integration_tests/functional/test_pause_resume.py index c6dfba2cb0f..4210e3a0d7f 100644 --- a/tests/integration_tests/functional/test_pause_resume.py +++ b/tests/integration_tests/functional/test_pause_resume.py @@ -49,16 +49,16 @@ def test_pause_resume(uvm_nano): microvm.flush_metrics() # Verify guest is no longer active. - with pytest.raises(AssertionError): - microvm.wait_for_ssh_up() + with pytest.raises(ChildProcessError): + microvm.ssh.check_output("true") # Verify emulation was indeed paused and no events from either # guest or host side were handled. verify_net_emulation_paused(microvm.flush_metrics()) # Verify guest is no longer active. - with pytest.raises(AssertionError): - microvm.wait_for_ssh_up() + with pytest.raises(ChildProcessError): + microvm.ssh.check_output("true") # Pausing the microVM when it is already `Paused` is allowed # (microVM remains in `Paused` state).