diff --git a/CHANGELOG.md b/CHANGELOG.md index 9c12bf61ecf..2aaf2a579d2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,15 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## \[Unreleased\] + +### Fixed + +- [#4836](https://github.com/firecracker-microvm/firecracker/pull/4836): Fixed + Vsock not notifying guest about `TRANSPORT_RESET_EVENT` event after snapshot + restore. This resulted in guest waiting indefinitely on a connection which was + reset during snapshot creation. + ## \[1.9.1\] ### Fixed diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 577632c728f..c63f7fb513f 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -33,7 +33,7 @@ use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::mmio::MmioTransport; use crate::devices::virtio::net::Net; use crate::devices::virtio::rng::Entropy; -use crate::devices::virtio::vsock::TYPE_VSOCK; +use crate::devices::virtio::vsock::{Vsock, VsockUnixBackend, TYPE_VSOCK}; use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; use crate::devices::BusDevice; #[cfg(target_arch = "x86_64")] @@ -486,6 +486,16 @@ impl MMIODeviceManager { // so for Vsock we don't support connection persistence through snapshot. // Any in-flight packets or events are simply lost. // Vsock is restored 'empty'. + // The only reason we still `kick` it is to make guest process + // `TRANSPORT_RESET_EVENT` event we sent during snapshot creation. 
+ let vsock = virtio + .as_mut_any() + .downcast_mut::<Vsock<VsockUnixBackend>>() + .unwrap(); + if vsock.is_activated() { + info!("kick vsock {id}."); + vsock.signal_used_queue().unwrap(); + } } TYPE_RNG => { let entropy = virtio.as_mut_any().downcast_mut::<Entropy>().unwrap(); diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 7a51bf790e9..1a7505fb1dd 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -365,11 +365,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { .downcast_mut::<Vsock<VsockUnixBackend>>() .unwrap(); - let vsock_state = VsockState { - backend: vsock.backend().save(), - frontend: vsock.save(), - }; - // Send Transport event to reset connections if device // is activated. if vsock.is_activated() { @@ -378,6 +373,13 @@ impl<'a> Persist<'a> for MMIODeviceManager { }); } + // Save state after potential notification to the guest. This + // way we save changes to the queue the notification can cause. + let vsock_state = VsockState { + backend: vsock.backend().save(), + frontend: vsock.save(), + }; + states.vsock_device = Some(ConnectedVsockState { device_id: devid.clone(), device_state: vsock_state, diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index b6dd71ef713..2b23f4baff0 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -653,6 +653,8 @@ def spawn( # and leave 0.2 delay between them. 
+ if "no-api" not in self.jailer.extra_args: self._wait_create() + if "config-file" in self.jailer.extra_args and self.iface: + self.wait_for_up() if self.log_file and log_level in ("Trace", "Debug", "Info"): self.check_log_message("Running Firecracker") @@ -876,6 +878,9 @@ def start(self): # Check that the VM has started assert self.state == "Running" + if self.iface: + self.wait_for_up() + def pause(self): """Pauses the microVM""" self.api.vm.patch(state="Paused") @@ -956,6 +961,9 @@ def restore_from_snapshot( enable_diff_snapshots=snapshot.is_diff, resume_vm=resume, ) + # This is not a "wait for boot", but rather a "VM still works after restoration" + if snapshot.net_ifaces and resume: + self.wait_for_up() return jailed_snapshot def enable_entropy_device(self): diff --git a/tests/host_tools/udp_offload.py b/tests/host_tools/udp_offload.py new file mode 100644 index 00000000000..e9ab6a93966 --- /dev/null +++ b/tests/host_tools/udp_offload.py @@ -0,0 +1,58 @@ +# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 +""" +A utility for sending a UDP message with UDP offload enabled. + +Inspired by the "TUN_F_CSUM is a must" chapter +in https://blog.cloudflare.com/fr-fr/virtual-networking-101-understanding-tap/ +by Cloudflare. 
+""" + +import socket +import sys + + +def eprint(*args, **kwargs): + """Print to stderr""" + print(*args, file=sys.stderr, **kwargs) + + +# Define SOL_UDP and UDP_SEGMENT if not defined in the system headers +try: + from socket import SOL_UDP, UDP_SEGMENT +except ImportError: + SOL_UDP = 17 # Protocol number for UDP + UDP_SEGMENT = 103 # Option code for UDP segmentation (non-standard) + +# Get the IP and port from command-line arguments +if len(sys.argv) != 3: + eprint("Usage: python3 udp_offload.py <ip> <port>") + sys.exit(1) + +ip_address = sys.argv[1] +port = int(sys.argv[2]) + +# Create a UDP socket +sockfd = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + +# Set the UDP segmentation option (UDP_SEGMENT) to 1400 bytes +OPTVAL = 1400 +try: + sockfd.setsockopt(SOL_UDP, UDP_SEGMENT, OPTVAL) +except (AttributeError, PermissionError): + eprint("Unable to set UDP_SEGMENT option") + sys.exit(1) + +# Set the destination address and port +servaddr = (ip_address, port) + +# Send the message to the destination address +MESSAGE = b"x" +try: + sockfd.sendto(MESSAGE, servaddr) + print("Message sent successfully") +except socket.error as e: + eprint(f"Error sending message: {e}") + sys.exit(1) + +sockfd.close() diff --git a/tests/integration_tests/functional/test_api.py b/tests/integration_tests/functional/test_api.py index 94ae1cd453b..5aebe7b5265 100644 --- a/tests/integration_tests/functional/test_api.py +++ b/tests/integration_tests/functional/test_api.py @@ -83,7 +83,6 @@ def test_drive_io_engine(uvm_plain): test_microvm.api.drive.put(**kwargs) test_microvm.start() - test_microvm.wait_for_up() assert test_microvm.api.vm_config.get().json()["drives"][0]["io_engine"] == "Sync" @@ -1166,7 +1165,6 @@ def test_get_full_config_after_restoring_snapshot(microvm_factory, uvm_nano): } ] - uvm_nano.wait_for_up() snapshot = uvm_nano.snapshot_full() uvm2 = microvm_factory.build() uvm2.spawn() diff --git a/tests/integration_tests/functional/test_balloon.py 
b/tests/integration_tests/functional/test_balloon.py index cd4b3d33637..8d7973db273 100644 --- a/tests/integration_tests/functional/test_balloon.py +++ b/tests/integration_tests/functional/test_balloon.py @@ -74,10 +74,8 @@ def make_guest_dirty_memory(ssh_connection, amount_mib=32): logger.error("while running: %s", cmd) logger.error("stdout: %s", stdout) logger.error("stderr: %s", stderr) - - cmd = "cat /tmp/fillmem_output.txt" except TimeoutExpired: - # It's ok if this expires. Some times the SSH connection + # It's ok if this expires. Sometimes the SSH connection # gets killed by the OOM killer *after* the fillmem program # started. As a result, we can ignore timeouts here. pass @@ -198,7 +196,6 @@ def test_deflate_on_oom(uvm_plain_any, deflate_on_oom): # Start the microvm. test_microvm.start() - test_microvm.wait_for_up() firecracker_pid = test_microvm.firecracker_pid # We get an initial reading of the RSS, then calculate the amount @@ -243,7 +240,6 @@ def test_reinflate_balloon(uvm_plain_any): # Start the microvm. test_microvm.start() - test_microvm.wait_for_up() firecracker_pid = test_microvm.firecracker_pid # First inflate the balloon to free up the uncertain amount of memory @@ -340,16 +336,27 @@ def test_stats(uvm_plain_any): # Add a memory balloon with stats enabled. test_microvm.api.balloon.put( - amount_mib=0, deflate_on_oom=True, stats_polling_interval_s=1 + amount_mib=0, + deflate_on_oom=True, + stats_polling_interval_s=STATS_POLLING_INTERVAL_S, ) # Start the microvm. test_microvm.start() firecracker_pid = test_microvm.firecracker_pid + # Give Firecracker enough time to poll the stats at least once post-boot + time.sleep(STATS_POLLING_INTERVAL_S * 2) + # Get an initial reading of the stats. initial_stats = test_microvm.api.balloon_stats.get().json() + # Major faults happen when a page fault has to be satisfied from disk. 
They are not + # triggered by our `make_guest_dirty_memory` workload, as it uses MAP_ANONYMOUS, which + # only triggers minor faults. However, during the boot process, things are read from the + # rootfs, so we should at least see a non-zero number of major faults. + assert initial_stats["major_faults"] > 0 + # Dirty 10MB of pages. make_guest_dirty_memory(test_microvm.ssh, amount_mib=10) time.sleep(1) @@ -359,7 +366,6 @@ def test_stats(uvm_plain_any): # Make sure that the stats catch the page faults. after_workload_stats = test_microvm.api.balloon_stats.get().json() assert initial_stats.get("minor_faults", 0) < after_workload_stats["minor_faults"] - assert initial_stats.get("major_faults", 0) < after_workload_stats["major_faults"] # Now inflate the balloon with 10MB of pages. test_microvm.api.balloon.patch(amount_mib=10) @@ -482,8 +488,6 @@ def test_balloon_snapshot(microvm_factory, guest_kernel, rootfs): microvm.spawn() microvm.restore_from_snapshot(snapshot, resume=True) - microvm.wait_for_up() - # Get the firecracker from snapshot pid, and open an ssh connection. firecracker_pid = microvm.firecracker_pid @@ -520,24 +524,6 @@ def test_balloon_snapshot(microvm_factory, guest_kernel, rootfs): assert stats_after_snap["available_memory"] > latest_stats["available_memory"] -def test_snapshot_compatibility(microvm_factory, guest_kernel, rootfs): - """ - Test that the balloon serializes correctly. - """ - vm = microvm_factory.build(guest_kernel, rootfs) - vm.spawn() - vm.basic_config( - vcpu_count=2, - mem_size_mib=256, - ) - - # Add a memory balloon with stats enabled. - vm.api.balloon.put(amount_mib=0, deflate_on_oom=True, stats_polling_interval_s=1) - - vm.start() - vm.snapshot_full() - - def test_memory_scrub(microvm_factory, guest_kernel, rootfs): """ Test that the memory is zeroed after deflate. 
diff --git a/tests/integration_tests/functional/test_cmd_line_start.py b/tests/integration_tests/functional/test_cmd_line_start.py index 82425458e89..096bea38c6c 100644 --- a/tests/integration_tests/functional/test_cmd_line_start.py +++ b/tests/integration_tests/functional/test_cmd_line_start.py @@ -7,7 +7,6 @@ import platform import re import shutil -import time from pathlib import Path import pytest @@ -164,7 +163,6 @@ def test_config_start_no_api_exit(uvm_plain, vm_config_file): test_microvm.jailer.extra_args.update({"no-api": None}) test_microvm.spawn() # Start Firecracker and MicroVM - time.sleep(3) # Wait for startup test_microvm.ssh.run("reboot") # Exit test_microvm.mark_killed() # waits for process to terminate @@ -266,7 +264,7 @@ def test_config_start_with_limit(uvm_plain, vm_config_file): response += '{ "error": "Request payload with size 260 is larger than ' response += "the limit of 250 allowed by server.\n" response += 'All previous unanswered requests will be dropped." }' - _, stdout, _stderr = utils.check_output(cmd) + _, stdout, _ = utils.check_output(cmd) assert stdout.encode("utf-8") == response.encode("utf-8") @@ -421,8 +419,6 @@ def test_config_start_and_mmds_with_api(uvm_plain, vm_config_file): # Network namespace has already been created. test_microvm.spawn() - assert test_microvm.state == "Running" - data_store = { "latest": { "meta-data": {"ami-id": "ami-12345678", "reservation-id": "r-fea54097"} @@ -434,7 +430,7 @@ def test_config_start_and_mmds_with_api(uvm_plain, vm_config_file): assert response.json() == {} # Populate MMDS with data. - response = test_microvm.api.mmds.put(**data_store) + test_microvm.api.mmds.put(**data_store) # Ensure the MMDS contents have been successfully updated. 
response = test_microvm.api.mmds.get() diff --git a/tests/integration_tests/functional/test_concurrency.py b/tests/integration_tests/functional/test_concurrency.py index 66712e92cf6..e4756729f2b 100644 --- a/tests/integration_tests/functional/test_concurrency.py +++ b/tests/integration_tests/functional/test_concurrency.py @@ -19,7 +19,6 @@ def launch1(): microvm.basic_config(vcpu_count=1, mem_size_mib=128) microvm.add_net_iface() microvm.start() - microvm.wait_for_up() with ThreadPoolExecutor(max_workers=NO_OF_MICROVMS) as tpe: for _ in range(NO_OF_MICROVMS): diff --git a/tests/integration_tests/functional/test_cpu_features.py b/tests/integration_tests/functional/test_cpu_features.py index 8f0d5884c34..ac2b391c73b 100644 --- a/tests/integration_tests/functional/test_cpu_features.py +++ b/tests/integration_tests/functional/test_cpu_features.py @@ -672,7 +672,6 @@ def test_cpu_template(uvm_plain_any, cpu_template, microvm_factory): restored_vm = microvm_factory.build() restored_vm.spawn() restored_vm.restore_from_snapshot(snapshot, resume=True) - restored_vm.wait_for_up() check_masked_features(restored_vm, cpu_template) check_enabled_features(restored_vm, cpu_template) diff --git a/tests/integration_tests/functional/test_cpu_features_aarch64.py b/tests/integration_tests/functional/test_cpu_features_aarch64.py index 634c72d1692..d45fd1a83c3 100644 --- a/tests/integration_tests/functional/test_cpu_features_aarch64.py +++ b/tests/integration_tests/functional/test_cpu_features_aarch64.py @@ -113,7 +113,6 @@ def test_cpu_features_with_static_template( restored_vm = microvm_factory.build() restored_vm.spawn() restored_vm.restore_from_snapshot(snapshot, resume=True) - restored_vm.wait_for_up() _check_cpu_features_arm(restored_vm, guest_kv, "v1n1") @@ -143,5 +142,4 @@ def test_cpu_features_with_custom_template( restored_vm = microvm_factory.build() restored_vm.spawn() restored_vm.restore_from_snapshot(snapshot, resume=True) - restored_vm.wait_for_up() 
_check_cpu_features_arm(restored_vm, guest_kv, custom_cpu_template["name"]) diff --git a/tests/integration_tests/functional/test_dirty_pages_in_full_snapshot.py b/tests/integration_tests/functional/test_dirty_pages_in_full_snapshot.py index 75f0cdda2d6..d25afa083d0 100644 --- a/tests/integration_tests/functional/test_dirty_pages_in_full_snapshot.py +++ b/tests/integration_tests/functional/test_dirty_pages_in_full_snapshot.py @@ -14,7 +14,6 @@ def test_dirty_pages_after_full_snapshot(uvm_plain): uvm.basic_config(mem_size_mib=vm_mem_size, track_dirty_pages=True) uvm.add_net_iface() uvm.start() - uvm.wait_for_up() snap_full = uvm.snapshot_full(vmstate_path="vmstate_full", mem_path="mem_full") snap_diff = uvm.snapshot_diff(vmstate_path="vmstate_diff", mem_path="mem_diff") diff --git a/tests/integration_tests/functional/test_drive_vhost_user.py b/tests/integration_tests/functional/test_drive_vhost_user.py index 0cd1a5afd16..31a11a75661 100644 --- a/tests/integration_tests/functional/test_drive_vhost_user.py +++ b/tests/integration_tests/functional/test_drive_vhost_user.py @@ -56,7 +56,6 @@ def test_vhost_user_block(microvm_factory, guest_kernel, rootfs_ubuntu_22): "vhost_user_block", 1, aggr_supported=False ) vm.start() - vm.wait_for_up() # Now check that vhost-user-block with rw is last. # 1-0 means line 1, column 0. @@ -91,7 +90,6 @@ def test_vhost_user_block_read_write(microvm_factory, guest_kernel, rootfs_ubunt vm.add_vhost_user_drive("rootfs", rootfs_rw, is_root_device=True) vm.add_net_iface() vm.start() - vm.wait_for_up() # Now check that vhost-user-block with rw is last. # 1-0 means line 1, column 0. @@ -120,7 +118,6 @@ def test_vhost_user_block_disconnect(microvm_factory, guest_kernel, rootfs_ubunt ) vm.add_net_iface() vm.start() - vm.wait_for_up() # Killing the backend vm.disks_vhost_user["rootfs"].kill() @@ -231,7 +228,6 @@ def test_partuuid_boot( ) vm.add_net_iface() vm.start() - vm.wait_for_up() # Now check that vhost-user-block with rw is last. 
# 1-0 means line 1, column 0. @@ -275,7 +271,6 @@ def test_partuuid_update(microvm_factory, guest_kernel, rootfs_ubuntu_22): "vhost_user_block", 1, aggr_supported=False ) vm.start() - vm.wait_for_up() # Now check that vhost-user-block with rw is last. # 1-0 means line 1, column 0. diff --git a/tests/integration_tests/functional/test_mmds.py b/tests/integration_tests/functional/test_mmds.py index abc9f25ee54..51ea6358631 100644 --- a/tests/integration_tests/functional/test_mmds.py +++ b/tests/integration_tests/functional/test_mmds.py @@ -94,7 +94,6 @@ def _validate_mmds_snapshot( microvm = microvm_factory.build(**kwargs) microvm.spawn() microvm.restore_from_snapshot(snapshot, resume=True) - microvm.wait_for_up() ssh_connection = microvm.ssh diff --git a/tests/integration_tests/functional/test_net.py b/tests/integration_tests/functional/test_net.py index c177b82267d..12980c727b2 100644 --- a/tests/integration_tests/functional/test_net.py +++ b/tests/integration_tests/functional/test_net.py @@ -78,3 +78,65 @@ def test_multi_queue_unsupported(uvm_plain): host_dev_name=tapname, guest_mac="AA:FC:00:00:00:01", ) + + +def run_udp_offload_test(vm): + """ + - Start a socat UDP server in the guest. + - Try to send a UDP message with UDP offload enabled. + + If tap offload features are not configured, an attempt to send a message will fail with EIO "Input/output error". 
+ More info (search for "TUN_F_CSUM is a must"): https://blog.cloudflare.com/fr-fr/virtual-networking-101-understanding-tap/ + """ + port = "81" + out_filename = "/tmp/out.txt" + message = "x" + + # Start a UDP server in the guest + # vm.ssh.check_output(f"nohup socat UDP-LISTEN:{port} - > {out_filename} &") + vm.ssh.check_output( + f"nohup socat UDP-LISTEN:{port} OPEN:{out_filename},creat > /dev/null 2>&1 &" + ) + + # Try to send a UDP message from host with UDP offload enabled + cmd = f"ip netns exec {vm.ssh_iface().netns} python3 ./host_tools/udp_offload.py {vm.ssh_iface().host} {port}" + ret = utils.run_cmd(cmd) + + # Check that the transmission was successful + assert ret.returncode == 0, f"{ret.stdout=} {ret.stderr=}" + + # Check that the server received the message + ret = vm.ssh.run(f"cat {out_filename}") + assert ret.stdout == message, f"{ret.stdout=} {ret.stderr=}" + + +def test_tap_offload_booted(uvm_plain_any): + """ + Verify that tap offload features are configured for a booted VM. + """ + vm = uvm_plain_any + vm.spawn() + vm.basic_config() + vm.add_net_iface() + vm.start() + + run_udp_offload_test(vm) + + +def test_tap_offload_restored(microvm_factory, guest_kernel, rootfs_ubuntu_22): + """ + Verify that tap offload features are configured for a restored VM. 
+ """ + src = microvm_factory.build(guest_kernel, rootfs_ubuntu_22, monitor_memory=False) + src.spawn() + src.basic_config() + src.add_net_iface() + src.start() + snapshot = src.snapshot_full() + src.kill() + + dst = microvm_factory.build() + dst.spawn() + dst.restore_from_snapshot(snapshot, resume=True) + + run_udp_offload_test(dst) diff --git a/tests/integration_tests/functional/test_pause_resume.py b/tests/integration_tests/functional/test_pause_resume.py index 68a22353de2..ab8c97ab7fb 100644 --- a/tests/integration_tests/functional/test_pause_resume.py +++ b/tests/integration_tests/functional/test_pause_resume.py @@ -41,7 +41,6 @@ def test_pause_resume(uvm_nano): microvm.api.vm.patch(state="Resumed") microvm.start() - microvm.wait_for_up() # Pausing the microVM after it's been started is successful. microvm.api.vm.patch(state="Paused") @@ -69,14 +68,12 @@ def test_pause_resume(uvm_nano): microvm.api.vm.patch(state="Resumed") # Verify guest is active again. - microvm.wait_for_up() # Resuming the microVM when it is already `Resumed` is allowed # (microVM remains in the running state). microvm.api.vm.patch(state="Resumed") # Verify guest is still active. 
- microvm.wait_for_up() microvm.kill() diff --git a/tests/integration_tests/functional/test_rng.py b/tests/integration_tests/functional/test_rng.py index a52c94b7c66..b40aa66033d 100644 --- a/tests/integration_tests/functional/test_rng.py +++ b/tests/integration_tests/functional/test_rng.py @@ -87,7 +87,6 @@ def test_rng_snapshot(uvm_with_rng, microvm_factory): new_vm = microvm_factory.build() new_vm.spawn() new_vm.restore_from_snapshot(snapshot, resume=True) - new_vm.wait_for_up() assert_virtio_rng_is_current_hwrng_device(new_vm.ssh) check_entropy(new_vm.ssh) diff --git a/tests/integration_tests/functional/test_serial_io.py b/tests/integration_tests/functional/test_serial_io.py index a2066283d8f..db1521d4a44 100644 --- a/tests/integration_tests/functional/test_serial_io.py +++ b/tests/integration_tests/functional/test_serial_io.py @@ -226,6 +226,5 @@ def test_no_serial_fd_error_when_daemonized(uvm_plain): mem_size_mib=512, ) test_microvm.start() - test_microvm.wait_for_up() assert REGISTER_FAILED_WARNING not in test_microvm.log_data diff --git a/tests/integration_tests/functional/test_snapshot_basic.py b/tests/integration_tests/functional/test_snapshot_basic.py index 998d5d027af..e07175a662b 100644 --- a/tests/integration_tests/functional/test_snapshot_basic.py +++ b/tests/integration_tests/functional/test_snapshot_basic.py @@ -59,7 +59,6 @@ def test_resume_after_restoration(uvm_nano, microvm_factory): vm = uvm_nano vm.add_net_iface() vm.start() - vm.wait_for_up() snapshot = vm.snapshot_full() @@ -67,7 +66,6 @@ def test_resume_after_restoration(uvm_nano, microvm_factory): restored_vm.spawn() restored_vm.restore_from_snapshot(snapshot) restored_vm.resume() - restored_vm.wait_for_up() def test_resume_at_restoration(uvm_nano, microvm_factory): @@ -79,14 +77,12 @@ def test_resume_at_restoration(uvm_nano, microvm_factory): vm = uvm_nano vm.add_net_iface() vm.start() - vm.wait_for_up() snapshot = vm.snapshot_full() restored_vm = microvm_factory.build() 
restored_vm.spawn() restored_vm.restore_from_snapshot(snapshot, resume=True) - restored_vm.wait_for_up() def test_snapshot_current_version(uvm_nano): @@ -150,7 +146,6 @@ def test_5_snapshots( vm.add_net_iface() vm.api.vsock.put(vsock_id="vsock0", guest_cid=3, uds_path=VSOCK_UDS_PATH) vm.start() - vm.wait_for_up() vm_blob_path = "/tmp/vsock/test.blob" # Generate a random data file for vsock. @@ -220,7 +215,6 @@ def test_patch_drive_snapshot(uvm_nano, microvm_factory): scratch_disk1 = drive_tools.FilesystemFile(scratch_path1, size=128) basevm.add_drive("scratch", scratch_disk1.path) basevm.start() - basevm.wait_for_up() # Update drive to have another backing file, double in size. new_file_size_mb = 2 * int(scratch_disk1.size() / (1024 * 1024)) @@ -238,7 +232,6 @@ def test_patch_drive_snapshot(uvm_nano, microvm_factory): vm = microvm_factory.build() vm.spawn() vm.restore_from_snapshot(snapshot, resume=True) - vm.wait_for_up() # Attempt to connect to resumed microvm and verify the new microVM has the # right scratch drive. @@ -299,7 +292,6 @@ def test_cmp_full_and_first_diff_mem(microvm_factory, guest_kernel, rootfs): ) vm.add_net_iface() vm.start() - vm.wait_for_up() logger.info("Create diff snapshot.") # Create diff snapshot. @@ -322,7 +314,6 @@ def test_negative_postload_api(uvm_plain, microvm_factory): basevm.basic_config(track_dirty_pages=True) basevm.add_net_iface() basevm.start() - basevm.wait_for_up() # Create base snapshot. snapshot = basevm.snapshot_diff() @@ -474,7 +465,6 @@ def test_diff_snapshot_overlay(guest_kernel, rootfs, microvm_factory): basevm.basic_config(track_dirty_pages=True) basevm.add_net_iface() basevm.start() - basevm.wait_for_up() # The first snapshot taken will always contain all memory (even if its specified as "diff"). 
# We use a diff snapshot here, as taking a full snapshot does not clear the dirty page tracking, @@ -502,7 +492,6 @@ def test_diff_snapshot_overlay(guest_kernel, rootfs, microvm_factory): new_vm.restore_from_snapshot(merged_snapshot, resume=True) # Check that the restored VM works - new_vm.wait_for_up() def test_snapshot_overwrite_self(guest_kernel, rootfs, microvm_factory): @@ -518,7 +507,6 @@ def test_snapshot_overwrite_self(guest_kernel, rootfs, microvm_factory): base_vm.basic_config() base_vm.add_net_iface() base_vm.start() - base_vm.wait_for_up() snapshot = base_vm.snapshot_full() base_vm.kill() @@ -538,7 +526,6 @@ def test_snapshot_overwrite_self(guest_kernel, rootfs, microvm_factory): # Check the overwriting the snapshot file from which this microvm was originally # restored, with a new snapshot of this vm, does not break the VM - vm.wait_for_up() @pytest.mark.parametrize("snapshot_type", [SnapshotType.DIFF, SnapshotType.FULL]) @@ -551,7 +538,6 @@ def test_vmgenid(guest_kernel_linux_6_1, rootfs, microvm_factory, snapshot_type) base_vm.basic_config(track_dirty_pages=True) base_vm.add_net_iface() base_vm.start() - base_vm.wait_for_up() snapshot = base_vm.make_snapshot(snapshot_type) base_snapshot = snapshot diff --git a/tests/integration_tests/functional/test_snapshot_editor.py b/tests/integration_tests/functional/test_snapshot_editor.py index 3790b69f610..4d466a441ce 100644 --- a/tests/integration_tests/functional/test_snapshot_editor.py +++ b/tests/integration_tests/functional/test_snapshot_editor.py @@ -27,7 +27,6 @@ def test_remove_regs(uvm_nano, microvm_factory): vm = uvm_nano vm.add_net_iface() vm.start() - vm.wait_for_up() snapshot = vm.snapshot_full() @@ -72,4 +71,3 @@ def test_remove_regs(uvm_nano, microvm_factory): new_vm = microvm_factory.build() new_vm.spawn() new_vm.restore_from_snapshot(snapshot, resume=True) - new_vm.wait_for_up() diff --git a/tests/integration_tests/functional/test_snapshot_not_losing_dirty_pages.py 
b/tests/integration_tests/functional/test_snapshot_not_losing_dirty_pages.py index 9f0ed465215..812e706b926 100644 --- a/tests/integration_tests/functional/test_snapshot_not_losing_dirty_pages.py +++ b/tests/integration_tests/functional/test_snapshot_not_losing_dirty_pages.py @@ -41,7 +41,6 @@ def test_diff_snapshot_works_after_error( uvm.basic_config(mem_size_mib=vm_mem_size, track_dirty_pages=True) uvm.add_net_iface() uvm.start() - uvm.wait_for_up() chroot = Path(uvm.chroot()) @@ -68,6 +67,5 @@ def test_diff_snapshot_works_after_error( vm2 = microvm_factory.build() vm2.spawn() vm2.restore_from_snapshot(snap2, resume=True) - vm2.wait_for_up() uvm.kill() diff --git a/tests/integration_tests/functional/test_uffd.py b/tests/integration_tests/functional/test_uffd.py index 2037a3b7aac..44de52ed2d5 100644 --- a/tests/integration_tests/functional/test_uffd.py +++ b/tests/integration_tests/functional/test_uffd.py @@ -28,7 +28,6 @@ def snapshot_fxt(microvm_factory, guest_kernel_linux_5_10, rootfs_ubuntu_22): ) basevm.start() - basevm.wait_for_up() # Create base snapshot. snapshot = basevm.snapshot_full() @@ -123,7 +122,6 @@ def test_valid_handler(uvm_plain, snapshot, uffd_handler_paths): vm.api.balloon.patch(amount_mib=0) # Verify if the restored guest works. - vm.wait_for_up() def test_malicious_handler(uvm_plain, snapshot, uffd_handler_paths): diff --git a/tests/integration_tests/functional/test_vsock.py b/tests/integration_tests/functional/test_vsock.py index a09bd246e9b..95f52c670b4 100644 --- a/tests/integration_tests/functional/test_vsock.py +++ b/tests/integration_tests/functional/test_vsock.py @@ -14,6 +14,9 @@ """ import os.path +import subprocess +import time +from pathlib import Path from socket import timeout as SocketTimeout from framework.utils_vsock import ( @@ -76,7 +79,6 @@ def negative_test_host_connections(vm, blob_path, blob_hash): # Validate that guest is still up and running. # Should fail if Firecracker exited from SIGPIPE handler. 
- vm.wait_for_up() metrics = vm.flush_metrics() validate_fc_metrics(metrics) @@ -126,7 +128,7 @@ def test_vsock_epipe(uvm_plain, bin_vsock_path, test_fc_session_root_path): validate_fc_metrics(metrics) -def test_vsock_transport_reset( +def test_vsock_transport_reset_h2g( uvm_nano, microvm_factory, bin_vsock_path, test_fc_session_root_path ): """ @@ -203,7 +205,6 @@ def test_vsock_transport_reset( vm2 = microvm_factory.build() vm2.spawn() vm2.restore_from_snapshot(snapshot, resume=True) - vm2.wait_for_up() # Check that vsock device still works. # Test guest-initiated connections. @@ -215,3 +216,72 @@ def test_vsock_transport_reset( check_host_connections(path, blob_path, blob_hash) metrics = vm2.flush_metrics() validate_fc_metrics(metrics) + + +def test_vsock_transport_reset_g2h(uvm_nano, microvm_factory): + """ + Vsock transport reset test. + """ + test_vm = uvm_nano + test_vm.add_net_iface() + test_vm.api.vsock.put(vsock_id="vsock0", guest_cid=3, uds_path=f"/{VSOCK_UDS_PATH}") + test_vm.start() + + # Create snapshot and terminate a VM. + snapshot = test_vm.snapshot_full() + test_vm.kill() + + for _ in range(5): + # Load snapshot. + new_vm = microvm_factory.build() + new_vm.spawn() + new_vm.restore_from_snapshot(snapshot, resume=True) + + # After snap restore all vsock connections should be + # dropped. This means guest socat should exit same way + # as it did after snapshot was taken. 
+ code, _, _ = new_vm.ssh.run("pidof socat") + assert code == 1 + + host_socket_path = os.path.join( + new_vm.path, f"{VSOCK_UDS_PATH}_{ECHO_SERVER_PORT}" + ) + host_socat_command = [ + "socat", + "-dddd", + f"UNIX-LISTEN:{host_socket_path},fork", + "STDOUT", + ] + host_socat = subprocess.Popen( + host_socat_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + + # Give some time for host socat to create socket + time.sleep(0.5) + assert Path(host_socket_path).exists() + new_vm.create_jailed_resource(host_socket_path) + + # Create a socat process in the guest which will connect to the host socat + guest_socat_command = ( + f"tmux new -d 'socat - vsock-connect:2:{ECHO_SERVER_PORT}'" + ) + new_vm.ssh.run(guest_socat_command) + + # socat should be running in the guest now + code, _, _ = new_vm.ssh.run("pidof socat") + assert code == 0 + + # Create snapshot. + snapshot = new_vm.snapshot_full() + new_vm.resume() + + # After `create_snapshot` + 'restore' calls, connection should be dropped + code, _, _ = new_vm.ssh.run("pidof socat") + assert code == 1 + + # Kill host socat as it is not useful anymore + host_socat.kill() + host_socat.communicate() + + # Terminate VM. 
+ new_vm.kill() diff --git a/tests/integration_tests/performance/test_huge_pages.py b/tests/integration_tests/performance/test_huge_pages.py index ddd9fca20df..ff13097a9db 100644 --- a/tests/integration_tests/performance/test_huge_pages.py +++ b/tests/integration_tests/performance/test_huge_pages.py @@ -62,7 +62,6 @@ def test_hugetlbfs_boot(uvm_plain): uvm_plain.basic_config(huge_pages=HugePagesConfig.HUGETLBFS_2MB, mem_size_mib=128) uvm_plain.add_net_iface() uvm_plain.start() - uvm_plain.wait_for_up() check_hugetlbfs_in_use( uvm_plain.firecracker_pid, @@ -84,7 +83,6 @@ def test_hugetlbfs_snapshot( vm.basic_config(huge_pages=HugePagesConfig.HUGETLBFS_2MB, mem_size_mib=128) vm.add_net_iface() vm.start() - vm.wait_for_up() check_hugetlbfs_in_use(vm.firecracker_pid, "/anon_hugepage") @@ -102,7 +100,6 @@ def test_hugetlbfs_snapshot( ) vm.restore_from_snapshot(snapshot, resume=True, uffd_path=SOCKET_PATH) - vm.wait_for_up() check_hugetlbfs_in_use(vm.firecracker_pid, "/anon_hugepage") @@ -126,7 +123,6 @@ def test_hugetlbfs_diff_snapshot(microvm_factory, uvm_plain, uffd_handler_paths) uvm_plain.start() # Wait for microvm to boot - uvm_plain.wait_for_up() base_snapshot = uvm_plain.snapshot_diff() uvm_plain.resume() @@ -151,7 +147,6 @@ def test_hugetlbfs_diff_snapshot(microvm_factory, uvm_plain, uffd_handler_paths) vm.restore_from_snapshot(snapshot_merged, resume=True, uffd_path=SOCKET_PATH) # Verify if the restored microvm works. - vm.wait_for_up() @pytest.mark.parametrize("huge_pages", HugePagesConfig) @@ -214,7 +209,6 @@ def test_ept_violation_count( with ftrace_events("kvm:*"): vm.restore_from_snapshot(snapshot, resume=True, uffd_path=SOCKET_PATH) - vm.wait_for_up() # Verify if guest can run commands, and also wake up the fast page fault helper to trigger page faults. 
rc, _, _ = vm.ssh.run(f"kill -s {signal.SIGUSR1} {pid}") diff --git a/tests/integration_tests/performance/test_memory_overhead.py b/tests/integration_tests/performance/test_memory_overhead.py index c98ed269e09..a2ebf607d2a 100644 --- a/tests/integration_tests/performance/test_memory_overhead.py +++ b/tests/integration_tests/performance/test_memory_overhead.py @@ -45,7 +45,6 @@ def test_memory_overhead( metrics.set_dimensions( {"performance_test": "test_memory_overhead", **microvm.dimensions} ) - microvm.wait_for_up() guest_mem_bytes = mem_size_mib * 2**20 guest_mem_splits = { diff --git a/tests/integration_tests/performance/test_snapshot_ab.py b/tests/integration_tests/performance/test_snapshot_ab.py index 82ad177a31d..837503f0a41 100644 --- a/tests/integration_tests/performance/test_snapshot_ab.py +++ b/tests/integration_tests/performance/test_snapshot_ab.py @@ -90,8 +90,6 @@ def sample_latency( microvm.spawn(emit_metrics=True) snapshot_copy = microvm.restore_from_snapshot(snapshot, resume=True) - microvm.wait_for_up() - value = 0 # Parse all metric data points in search of load_snapshot time. microvm.flush_metrics() @@ -138,7 +136,6 @@ def test_restore_latency( """ vm = test_setup.configure_vm(microvm_factory, guest_kernel_linux_4_14, rootfs) vm.start() - vm.wait_for_up() metrics.set_dimensions( { diff --git a/tests/integration_tests/security/test_jail.py b/tests/integration_tests/security/test_jail.py index a0477313c6a..68bb220e0c9 100644 --- a/tests/integration_tests/security/test_jail.py +++ b/tests/integration_tests/security/test_jail.py @@ -604,7 +604,6 @@ def test_firecracker_kill_by_pid(uvm_plain, daemonize, new_pid_ns): microvm.basic_config() microvm.add_net_iface() microvm.start() - microvm.wait_for_up() # before killing microvm make sure the Jailer config is what we set it to be. 
assert ( diff --git a/tests/integration_tests/security/test_vulnerabilities.py b/tests/integration_tests/security/test_vulnerabilities.py index aea71d5c6b5..a7faf762acd 100644 --- a/tests/integration_tests/security/test_vulnerabilities.py +++ b/tests/integration_tests/security/test_vulnerabilities.py @@ -114,7 +114,6 @@ def with_restore(factory, microvm_factory): def restore(firecracker=None, jailer=None): microvm = factory(firecracker, jailer) - microvm.wait_for_up() snapshot = microvm.snapshot_full() @@ -127,7 +126,6 @@ def restore(firecracker=None, jailer=None): dst_vm.spawn() # Restore the destination VM from the snapshot dst_vm.restore_from_snapshot(snapshot, resume=True) - dst_vm.wait_for_up() dst_vm.cpu_template = microvm.cpu_template return dst_vm diff --git a/tools/create_snapshot_artifact/main.py b/tools/create_snapshot_artifact/main.py index 3bf7b6d9e9f..75d439c1185 100755 --- a/tools/create_snapshot_artifact/main.py +++ b/tools/create_snapshot_artifact/main.py @@ -120,8 +120,6 @@ def main(): ) vm.start() - # Ensure the microVM has started. - assert vm.state == "Running" # Populate MMDS. data_store = {