diff --git a/Cargo.lock b/Cargo.lock index 6e2ae545440..899cdb112b9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -98,7 +98,7 @@ version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c8bdeb6047d8983be085bab0ba1472e6dc604e7041dbf6fcd5e71523014fae9" dependencies = [ - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -109,9 +109,15 @@ checksum = "403f75924867bb1033c59fbf0797484329750cfbe3c4325cd33127941fabc882" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys", + "windows-sys 0.59.0", ] +[[package]] +name = "anyhow" +version = "1.0.98" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" + [[package]] name = "arrayvec" version = "0.7.6" @@ -218,7 +224,7 @@ dependencies = [ "bitflags 2.9.1", "cexpr", "clang-sys", - "itertools 0.10.5", + "itertools 0.12.1", "lazy_static", "lazycell", "log", @@ -250,6 +256,12 @@ version = "3.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "cargo_toml" version = "0.22.3" @@ -589,7 +601,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "778e2ac28f6c47af28e4907f13ffd1e1ddbd400980a9abd7c8df189bf578a5ad" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.60.2", ] [[package]] @@ -721,7 +733,7 @@ version = "0.5.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589533453244b0995c858700322199b2becb13b627df2851f64a2775d024abcf" dependencies = [ - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -751,9 +763,9 @@ checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" [[package]] name = "itertools" -version = "0.10.5" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" dependencies = [ "either", ] @@ -886,7 +898,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" dependencies = [ "cfg-if", - "windows-targets", + "windows-targets 0.53.2", ] [[package]] @@ -1027,6 +1039,23 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +[[package]] +name = "pci" +version = "0.1.0" +dependencies = [ + "byteorder", + "displaydoc", + "libc", + "log", + "serde", + "serde_test", + "thiserror 2.0.12", + "vm-allocator", + "vm-device", + "vm-memory", + "vmm-sys-util", +] + [[package]] name = "peeking_take_while" version = "0.1.2" @@ -1214,7 +1243,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -1302,6 +1331,15 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_test" +version = "1.0.177" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f901ee573cab6b3060453d2d5f0bae4e6d628c23c0a962ff9b5f1d7c8d4f1ed" +dependencies = [ + "serde", +] + [[package]] name = "shlex" version = "1.3.0" @@ -1582,9 +1620,18 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "040a65b0c29f298d71ca45dd52d02b0d0ddc15b9b97d95dfeebe67d6fdd42a28" dependencies = [ "libc", + "serde", "thiserror 2.0.12", ] +[[package]] +name = "vm-device" +version = "0.1.0" +dependencies = [ + "serde", + "vmm-sys-util", +] + [[package]] name = "vm-fdt" version = "0.3.0" @@ -1614,11 +1661,13 @@ version = "0.1.0" dependencies = [ "acpi_tables", "aes-gcm", + "anyhow", "arrayvec", "aws-lc-rs", "base64", "bincode", "bitflags 2.9.1", + "byteorder", "crc64", "criterion", "derive_more", @@ -1636,6 +1685,7 @@ dependencies = [ "log-instrument", "memfd", "micro_http", + "pci", "proptest", "semver", "serde", @@ -1645,8 +1695,10 @@ dependencies = [ "timerfd", "userfaultfd", "utils", + "uuid", "vhost", "vm-allocator", + "vm-device", "vm-fdt", "vm-memory", "vm-superio", @@ -1777,7 +1829,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -1792,7 +1844,16 @@ version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.2", ] [[package]] @@ -1801,14 +1862,30 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.53.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c66f69fcc9ce11da9966ddb31a40968cad001c5bedeb5c2b82ede4253ab48aef" +dependencies = [ + "windows_aarch64_gnullvm 0.53.0", + "windows_aarch64_msvc 0.53.0", + "windows_i686_gnu 0.53.0", + "windows_i686_gnullvm 0.53.0", + "windows_i686_msvc 0.53.0", + "windows_x86_64_gnu 0.53.0", + "windows_x86_64_gnullvm 0.53.0", + "windows_x86_64_msvc 0.53.0", ] [[package]] @@ -1817,48 +1894,96 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" + [[package]] name = "windows_i686_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" +[[package]] +name = "windows_i686_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" + [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" + [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_i686_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" + [[package]] name = "winnow" version = "0.7.12" diff --git a/Cargo.toml b/Cargo.toml index 37a76cdd34f..a1c9ad79621 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,7 @@ exit = "warn" tests_outside_test_module = "warn" assertions_on_result_states = "warn" error_impl_error = "warn" +or_fun_call = "warn" [profile.dev] panic = "abort" diff --git a/resources/chroot.sh b/resources/chroot.sh index 93f6ca754f0..7aadeddb884 100755 --- a/resources/chroot.sh +++ b/resources/chroot.sh @@ -11,7 +11,7 @@ PS4='+\t ' cp -ruv $rootfs/* / -packages="udev systemd-sysv openssh-server iproute2 curl socat python3-minimal iperf3 iputils-ping fio kmod tmux hwloc-nox vim-tiny trace-cmd linuxptp strace python3-boto3" +packages="udev systemd-sysv openssh-server iproute2 curl socat python3-minimal iperf3 iputils-ping fio kmod tmux hwloc-nox vim-tiny trace-cmd linuxptp strace python3-boto3 pciutils" # msr-tools is only supported on x86-64. 
arch=$(uname -m) @@ -64,6 +64,10 @@ rm -vf /etc/systemd/system/timers.target.wants/* systemctl enable var-lib-systemd.mount +# disable Predictable Network Interface Names to keep ethN names +# even with PCI enabled +ln -s /dev/null /etc/systemd/network/99-default.link + #### trim image https://wiki.ubuntu.com/ReducingDiskFootprint # this does not save much, but oh well rm -rf /usr/share/{doc,man,info,locale} diff --git a/resources/guest_configs/pcie.config b/resources/guest_configs/pcie.config new file mode 100644 index 00000000000..b7262f7ae73 --- /dev/null +++ b/resources/guest_configs/pcie.config @@ -0,0 +1,8 @@ +CONFIG_BLK_MQ_PCI=y +CONFIG_PCI=y +CONFIG_PCI_MMCONFIG=y +CONFIG_PCI_MSI=y +CONFIG_PCIEPORTBUS=y +CONFIG_VIRTIO_PCI=y +CONFIG_PCI_HOST_COMMON=y +CONFIG_PCI_HOST_GENERIC=y diff --git a/resources/overlay/etc/systemd/system/fcnet.service b/resources/overlay/etc/systemd/system/fcnet.service index 26d3af1dc20..ace1c8322e1 100644 --- a/resources/overlay/etc/systemd/system/fcnet.service +++ b/resources/overlay/etc/systemd/system/fcnet.service @@ -1,5 +1,6 @@ [Service] Type=oneshot +ExecStartPre=/usr/bin/udevadm settle ExecStart=/usr/local/bin/fcnet-setup.sh [Install] WantedBy=sshd.service diff --git a/resources/overlay/usr/local/bin/init.c b/resources/overlay/usr/local/bin/init.c index caa3e9d91d5..4d469171ae5 100644 --- a/resources/overlay/usr/local/bin/init.c +++ b/resources/overlay/usr/local/bin/init.c @@ -13,7 +13,7 @@ // Position on the bus is defined by MMIO_LEN increments, where MMIO_LEN is // defined as 0x1000 in vmm/src/device_manager/mmio.rs. #ifdef __x86_64__ -#define MAGIC_MMIO_SIGNAL_GUEST_BOOT_COMPLETE 0xd0000000 +#define MAGIC_MMIO_SIGNAL_GUEST_BOOT_COMPLETE 0xc0000000 #endif #ifdef __aarch64__ #define MAGIC_MMIO_SIGNAL_GUEST_BOOT_COMPLETE 0x40000000 diff --git a/resources/rebuild.sh b/resources/rebuild.sh index f7215af371e..38313b6d0f0 100755 --- a/resources/rebuild.sh +++ b/resources/rebuild.sh @@ -109,7 +109,7 @@ function build_initramfs { # Report guest boot time back to Firecracker via MMIO # See arch/src/lib.rs and the BootTimer device - MAGIC_BOOT_ADDRESS=0xd0000000 + MAGIC_BOOT_ADDRESS=0xc0000000 if [ $ARCH = "aarch64" ]; then MAGIC_BOOT_ADDRESS=0x40000000 fi @@ -247,15 +247,16 @@ function build_al_kernels { clone_amazon_linux_repo CI_CONFIG="$PWD/guest_configs/ci.config" + PCIE_CONFIG="$PWD/guest_configs/pcie.config" if [[ "$KERNEL_VERSION" == @(all|5.10) ]]; then - build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10.config "$CI_CONFIG" + build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10.config "$CI_CONFIG" "$PCIE_CONFIG" fi if [[ $ARCH == "x86_64" && "$KERNEL_VERSION" == @(all|5.10-no-acpi) ]]; then - build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10-no-acpi.config "$CI_CONFIG" + build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10-no-acpi.config "$CI_CONFIG" "$PCIE_CONFIG" fi if [[ "$KERNEL_VERSION" == @(all|6.1) ]]; then - build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-6.1.config "$CI_CONFIG" + build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-6.1.config "$CI_CONFIG" "$PCIE_CONFIG" fi # Build debug kernels @@ -264,11 +265,11 @@ function build_al_kernels { OUTPUT_DIR=$OUTPUT_DIR/debug mkdir -pv $OUTPUT_DIR if [[ "$KERNEL_VERSION" == @(all|5.10) ]]; then - build_al_kernel "$PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10.config" "$CI_CONFIG" "$FTRACE_CONFIG" "$DEBUG_CONFIG" + build_al_kernel "$PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10.config" "$CI_CONFIG" "$PCIE_CONFIG" 
"$FTRACE_CONFIG" "$DEBUG_CONFIG" vmlinux_split_debuginfo $OUTPUT_DIR/vmlinux-5.10.* fi if [[ "$KERNEL_VERSION" == @(all|6.1) ]]; then - build_al_kernel "$PWD/guest_configs/microvm-kernel-ci-$ARCH-6.1.config" "$CI_CONFIG" "$FTRACE_CONFIG" "$DEBUG_CONFIG" + build_al_kernel "$PWD/guest_configs/microvm-kernel-ci-$ARCH-6.1.config" "$CI_CONFIG" "$PCIE_CONFIG" "$FTRACE_CONFIG" "$DEBUG_CONFIG" vmlinux_split_debuginfo $OUTPUT_DIR/vmlinux-6.1.* fi } diff --git a/resources/seccomp/aarch64-unknown-linux-musl.json b/resources/seccomp/aarch64-unknown-linux-musl.json index 8a3dac13673..433528b8f29 100644 --- a/resources/seccomp/aarch64-unknown-linux-musl.json +++ b/resources/seccomp/aarch64-unknown-linux-musl.json @@ -1020,6 +1020,49 @@ { "syscall": "restart_syscall", "comment": "automatically issued by the kernel when specific timing-related syscalls (e.g. nanosleep) get interrupted by SIGSTOP" + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 44547, + "comment": "KVM_CHECK_EXTENSION" + }, + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 131, + "comment": "KVM_CAP_MSI_DEVID" + } + ] + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1074310762, + "comment": "KVM_SET_GSI_ROUTING" + } + ] + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1075883638, + "comment": "KVM_IRQFD" + } + ] } ] } diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index c3462d2f86b..14f2a26bafd 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -1152,6 +1152,49 @@ { "syscall": "restart_syscall", "comment": "automatically issued by the kernel when specific timing-related syscalls (e.g. 
nanosleep) get interrupted by SIGSTOP" + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 44547, + "comment": "KVM_CHECK_EXTENSION" + }, + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 131, + "comment": "KVM_CAP_MSI_DEVID" + } + ] + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1074310762, + "comment": "KVM_SET_GSI_ROUTING" + } + ] + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1075883638, + "comment": "KVM_IRQFD" + } + ] } ] } diff --git a/src/acpi-tables/src/lib.rs b/src/acpi-tables/src/lib.rs index 321328047ed..d3b7df0791e 100644 --- a/src/acpi-tables/src/lib.rs +++ b/src/acpi-tables/src/lib.rs @@ -10,6 +10,7 @@ pub mod aml; pub mod dsdt; pub mod fadt; pub mod madt; +pub mod mcfg; pub mod rsdp; pub mod xsdt; @@ -17,6 +18,7 @@ pub use aml::Aml; pub use dsdt::Dsdt; pub use fadt::Fadt; pub use madt::Madt; +pub use mcfg::Mcfg; pub use rsdp::Rsdp; pub use xsdt::Xsdt; use zerocopy::little_endian::{U32, U64}; @@ -89,7 +91,7 @@ pub struct SdtHeader { pub oem_table_id: [u8; 8], pub oem_revision: U32, pub creator_id: [u8; 4], - pub creator_revison: U32, + pub creator_revision: U32, } impl SdtHeader { @@ -110,7 +112,7 @@ impl SdtHeader { oem_table_id, oem_revision: U32::new(oem_revision), creator_id: FC_ACPI_CREATOR_ID, - creator_revison: U32::new(FC_ACPI_CREATOR_REVISION), + creator_revision: U32::new(FC_ACPI_CREATOR_REVISION), } } } diff --git a/src/acpi-tables/src/mcfg.rs b/src/acpi-tables/src/mcfg.rs new file mode 100644 index 00000000000..a5dd8b9d227 --- /dev/null +++ b/src/acpi-tables/src/mcfg.rs @@ -0,0 +1,77 @@ +// Copyright © 2019 Intel Corporation +// Copyright © 2023 Rivos, Inc. +// Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// +// SPDX-License-Identifier: Apache-2.0 + +use std::mem::size_of; + +use vm_memory::{Bytes, GuestAddress, GuestMemory}; +use zerocopy::{Immutable, IntoBytes}; + +use crate::{Result, Sdt, SdtHeader, checksum}; + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Default, Debug, IntoBytes, Clone, Copy, Immutable)] +struct PciRangeEntry { + pub base_address: u64, + pub segment: u16, + pub start: u8, + pub end: u8, + _reserved: u32, +} + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Clone, Copy, Debug, Default, IntoBytes, Immutable)] +pub struct Mcfg { + header: SdtHeader, + _reserved: u64, + pci_range_entry: PciRangeEntry, +} + +impl Mcfg { + pub fn new( + oem_id: [u8; 6], + oem_table_id: [u8; 8], + oem_revision: u32, + pci_mmio_config_addr: u64, + ) -> Self { + let header = SdtHeader::new( + *b"MCFG", + size_of::().try_into().unwrap(), + 1, + oem_id, + oem_table_id, + oem_revision, + ); + + let mut mcfg = Mcfg { + header, + pci_range_entry: PciRangeEntry { + base_address: pci_mmio_config_addr, + segment: 0, + start: 0, + end: 0, + ..Default::default() + }, + ..Default::default() + }; + + mcfg.header.checksum = checksum(&[mcfg.as_bytes()]); + + mcfg + } +} + +impl Sdt for Mcfg { + fn len(&self) -> usize { + self.as_bytes().len() + } + + fn write_to_guest(&mut self, mem: &M, address: GuestAddress) -> Result<()> { + mem.write_slice(self.as_bytes(), address)?; + Ok(()) + } +} diff --git a/src/clippy-tracing/src/main.rs b/src/clippy-tracing/src/main.rs index c89fb6a5d37..721fca12b25 100644 --- a/src/clippy-tracing/src/main.rs +++ b/src/clippy-tracing/src/main.rs @@ -260,7 +260,7 @@ impl Error for ExecError {} fn exec() -> Result, ExecError> { let args = CommandLineArgs::parse(); - let path = args.path.unwrap_or(PathBuf::from(".")); + let path = args.path.unwrap_or_else(|| PathBuf::from(".")); for entry_res in WalkDir::new(path).follow_links(true) { let entry = entry_res.map_err(ExecError::Entry)?; let entry_path = entry.into_path(); diff --git a/src/cpu-template-helper/src/template/verify/mod.rs b/src/cpu-template-helper/src/template/verify/mod.rs index 1a83f6ba1b2..1f42e2f06cc 100644 --- a/src/cpu-template-helper/src/template/verify/mod.rs +++ b/src/cpu-template-helper/src/template/verify/mod.rs @@ -43,7 +43,7 @@ where for (key, template_value_filter) in template { let config_value_filter = config .get(&key) - .ok_or(VerifyError::KeyNotFound(key.to_string()))?; + .ok_or_else(|| VerifyError::KeyNotFound(key.to_string()))?; let template_value = template_value_filter.value & template_value_filter.filter; let config_value = config_value_filter.value & template_value_filter.filter; diff --git a/src/firecracker/Cargo.toml b/src/firecracker/Cargo.toml index c7e6c6a5d2e..74812a0f66d 100644 --- a/src/firecracker/Cargo.toml +++ b/src/firecracker/Cargo.toml @@ -42,7 +42,10 @@ serde_json = "1.0.142" [dev-dependencies] cargo_toml = "0.22.3" libc = "0.2.174" -regex = { version = "1.11.1", default-features = false, features = ["std", "unicode-perl"] } +regex = { version = "1.11.1", default-features = false, features = [ + "std", + "unicode-perl", +] } # Dev-Dependencies for uffd examples serde = { version = "1.0.219", features = ["derive"] } diff --git a/src/firecracker/src/api_server_adapter.rs b/src/firecracker/src/api_server_adapter.rs index 173ef298265..f597a5f7db9 100644 --- a/src/firecracker/src/api_server_adapter.rs +++ b/src/firecracker/src/api_server_adapter.rs @@ -143,6 +143,7 @@ pub(crate) fn run_with_api( instance_info: InstanceInfo, process_time_reporter: ProcessTimeReporter, 
boot_timer_enabled: bool, + pci_enabled: bool, api_payload_limit: usize, mmds_size_limit: usize, metadata_json: Option<&str>, @@ -212,6 +213,7 @@ pub(crate) fn run_with_api( json, instance_info, boot_timer_enabled, + pci_enabled, mmds_size_limit, metadata_json, ) @@ -224,6 +226,7 @@ pub(crate) fn run_with_api( &to_api, &api_event_fd, boot_timer_enabled, + pci_enabled, mmds_size_limit, metadata_json, ) diff --git a/src/firecracker/src/main.rs b/src/firecracker/src/main.rs index 6b01f776729..3e6ad35d6a9 100644 --- a/src/firecracker/src/main.rs +++ b/src/firecracker/src/main.rs @@ -260,6 +260,11 @@ fn main_exec() -> Result<(), MainError> { Argument::new("mmds-size-limit") .takes_value(true) .help("Mmds data store limit, in bytes."), + ) + .arg( + Argument::new("enable-pci") + .takes_value(false) + .help("Enables PCIe support."), ); arg_parser.parse_from_cmdline()?; @@ -369,6 +374,7 @@ fn main_exec() -> Result<(), MainError> { .map(|x| x.expect("Unable to open or read from the mmds content file")); let boot_timer_enabled = arguments.flag_present("boot-timer"); + let pci_enabled = arguments.flag_present("enable-pci"); let api_enabled = !arguments.flag_present("no-api"); let api_payload_limit = arg_parser .arguments() @@ -422,6 +428,7 @@ fn main_exec() -> Result<(), MainError> { instance_info, process_time_reporter, boot_timer_enabled, + pci_enabled, api_payload_limit, mmds_size_limit, metadata_json.as_deref(), @@ -437,6 +444,7 @@ fn main_exec() -> Result<(), MainError> { vmm_config_json, instance_info, boot_timer_enabled, + pci_enabled, mmds_size_limit, metadata_json.as_deref(), ) @@ -449,7 +457,7 @@ fn main_exec() -> Result<(), MainError> { /// the default the jailer would set). /// /// We do this resizing because the kernel default is 64, with a reallocation happening whenever -/// the tabel fills up. This was happening for some larger microVMs, and reallocating the +/// the table fills up. This was happening for some larger microVMs, and reallocating the /// fdtable while a lot of file descriptors are active (due to being eventfds/timerfds registered /// to epoll) incurs a penalty of 30ms-70ms on the snapshot restore path. fn resize_fdtable() -> Result<(), ResizeFdTableError> { @@ -554,12 +562,14 @@ pub enum BuildFromJsonError { } // Configure and start a microVM as described by the command-line JSON. 
+#[allow(clippy::too_many_arguments)] fn build_microvm_from_json( seccomp_filters: &BpfThreadMap, event_manager: &mut EventManager, config_json: String, instance_info: InstanceInfo, boot_timer_enabled: bool, + pci_enabled: bool, mmds_size_limit: usize, metadata_json: Option<&str>, ) -> Result<(VmResources, Arc>), BuildFromJsonError> { @@ -567,6 +577,7 @@ fn build_microvm_from_json( VmResources::from_json(&config_json, &instance_info, mmds_size_limit, metadata_json) .map_err(BuildFromJsonError::ParseFromJson)?; vm_resources.boot_timer = boot_timer_enabled; + vm_resources.pci_enabled = pci_enabled; let vmm = vmm::builder::build_and_boot_microvm( &instance_info, &vm_resources, @@ -593,6 +604,7 @@ fn run_without_api( config_json: Option, instance_info: InstanceInfo, bool_timer_enabled: bool, + pci_enabled: bool, mmds_size_limit: usize, metadata_json: Option<&str>, ) -> Result<(), RunWithoutApiError> { @@ -610,6 +622,7 @@ fn run_without_api( config_json.unwrap(), instance_info, bool_timer_enabled, + pci_enabled, mmds_size_limit, metadata_json, ) diff --git a/src/pci/Cargo.toml b/src/pci/Cargo.toml new file mode 100644 index 00000000000..17dc30fcd6d --- /dev/null +++ b/src/pci/Cargo.toml @@ -0,0 +1,30 @@ +[package] +authors = ["Samuel Ortiz "] +edition = "2021" +name = "pci" +version = "0.1.0" +license = "Apache-2.0 AND BSD-3-Clause" + +[lib] +bench = false + +[features] +default = [] + +[dependencies] +byteorder = "1.5.0" +displaydoc = "0.2.5" +libc = "0.2.172" +log = "0.4.27" +serde = { version = "1.0.219", features = ["derive"] } +thiserror = "2.0.12" +vm-allocator = "0.1.3" +vm-device = { path = "../vm-device" } +vm-memory = { version = "0.16.1", features = [ + "backend-mmap", + "backend-bitmap", +] } + +[dev-dependencies] +serde_test = "1.0.177" +vmm-sys-util = "0.14.0" diff --git a/src/pci/src/bus.rs b/src/pci/src/bus.rs new file mode 100644 index 00000000000..01c9b1f1933 --- /dev/null +++ b/src/pci/src/bus.rs @@ -0,0 +1,950 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::collections::HashMap; +use std::ops::DerefMut; +use std::sync::{Arc, Barrier, Mutex}; + +use byteorder::{ByteOrder, LittleEndian}; +use vm_device::BusDevice; + +use crate::configuration::{PciBridgeSubclass, PciClassCode, PciConfiguration, PciHeaderType}; +use crate::device::{DeviceRelocation, Error as PciDeviceError, PciDevice}; + +const VENDOR_ID_INTEL: u16 = 0x8086; +const DEVICE_ID_INTEL_VIRT_PCIE_HOST: u16 = 0x0d57; +const NUM_DEVICE_IDS: usize = 32; + +/// Errors for device manager. +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum PciRootError { + /// Could not allocate device address space for the device. + AllocateDeviceAddrs(PciDeviceError), + /// Could not allocate an IRQ number. + AllocateIrq, + /// Could not add a device to the port io bus. + PioInsert(vm_device::BusError), + /// Could not add a device to the mmio bus. + MmioInsert(vm_device::BusError), + /// Could not find an available device slot on the PCI bus. + NoPciDeviceSlotAvailable, + /// Invalid PCI device identifier provided. + InvalidPciDeviceSlot(usize), + /// Valid PCI device identifier but already used. + AlreadyInUsePciDeviceSlot(usize), +} +pub type Result = std::result::Result; + +/// Emulates the PCI Root bridge device. 
+pub struct PciRoot { + /// Configuration space. + config: PciConfiguration, +} + +impl PciRoot { + /// Create an empty PCI root bridge. + pub fn new(config: Option<PciConfiguration>) -> Self { + if let Some(config) = config { + PciRoot { config } + } else { + PciRoot { + config: PciConfiguration::new( + VENDOR_ID_INTEL, + DEVICE_ID_INTEL_VIRT_PCIE_HOST, + 0, + PciClassCode::BridgeDevice, + &PciBridgeSubclass::HostBridge, + None, + PciHeaderType::Device, + 0, + 0, + None, + None, + ), + } + } + } +} + +impl BusDevice for PciRoot {} + +impl PciDevice for PciRoot { + fn write_config_register( + &mut self, + reg_idx: usize, + offset: u64, + data: &[u8], + ) -> Option<Arc<Barrier>> { + self.config.write_config_register(reg_idx, offset, data); + None + } + + fn read_config_register(&mut self, reg_idx: usize) -> u32 { + self.config.read_reg(reg_idx) + } +} + +pub struct PciBus { + /// Devices attached to this bus. + /// Device 0 is host bridge. + pub devices: HashMap<u32, Arc<Mutex<dyn PciDevice>>>, + device_reloc: Arc<dyn DeviceRelocation>, + device_ids: Vec<bool>, +} + +impl PciBus { + pub fn new(pci_root: PciRoot, device_reloc: Arc<dyn DeviceRelocation>) -> Self { + let mut devices: HashMap<u32, Arc<Mutex<dyn PciDevice>>> = HashMap::new(); + let mut device_ids: Vec<bool> = vec![false; NUM_DEVICE_IDS]; + + devices.insert(0, Arc::new(Mutex::new(pci_root))); + device_ids[0] = true; + + PciBus { + devices, + device_reloc, + device_ids, + } + } + + pub fn add_device(&mut self, device_id: u32, device: Arc<Mutex<dyn PciDevice>>) -> Result<()> { + self.devices.insert(device_id, device); + Ok(()) + } + + pub fn next_device_id(&mut self) -> Result<u32> { + for (idx, device_id) in self.device_ids.iter_mut().enumerate() { + if !(*device_id) { + *device_id = true; + return Ok(idx as u32); + } + } + + Err(PciRootError::NoPciDeviceSlotAvailable) + } +} + +pub struct PciConfigIo { + /// Config space register. + config_address: u32, + pci_bus: Arc<Mutex<PciBus>>, +} + +impl PciConfigIo { + pub fn new(pci_bus: Arc<Mutex<PciBus>>) -> Self { + PciConfigIo { + config_address: 0, + pci_bus, + } + } + + pub fn config_space_read(&self) -> u32 { + let enabled = (self.config_address & 0x8000_0000) != 0; + if !enabled { + return 0xffff_ffff; + } + + let (bus, device, function, register) = + parse_io_config_address(self.config_address & !0x8000_0000); + + // Only support one bus. + if bus != 0 { + return 0xffff_ffff; + } + + // Don't support multi-function devices. + if function > 0 { + return 0xffff_ffff; + } + + self.pci_bus + .as_ref() + .lock() + .unwrap() + .devices + .get(&(device as u32)) + .map_or(0xffff_ffff, |d| { + d.lock().unwrap().read_config_register(register) + }) + } + + pub fn config_space_write(&mut self, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { + if offset as usize + data.len() > 4 { + return None; + } + + let enabled = (self.config_address & 0x8000_0000) != 0; + if !enabled { + return None; + } + + let (bus, device, function, register) = + parse_io_config_address(self.config_address & !0x8000_0000); + + // Only support one bus. + if bus != 0 { + return None; + } + + // Don't support multi-function devices. + if function > 0 { + return None; + } + + let pci_bus = self.pci_bus.as_ref().lock().unwrap(); + if let Some(d) = pci_bus.devices.get(&(device as u32)) { + let mut device = d.lock().unwrap(); + + // Find out if one of the device's BAR is being reprogrammed, and + // reprogram it if needed.
+ if let Some(params) = device.detect_bar_reprogramming(register, data) { + if let Err(e) = pci_bus.device_reloc.move_bar( + params.old_base, + params.new_base, + params.len, + device.deref_mut(), + params.region_type, + ) { + error!( + "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x})", + e, params.old_base, params.new_base, params.len + ); + } + } + + // Update the register value + device.write_config_register(register, offset, data) + } else { + None + } + } + + fn set_config_address(&mut self, offset: u64, data: &[u8]) { + if offset as usize + data.len() > 4 { + return; + } + let (mask, value): (u32, u32) = match data.len() { + 1 => ( + 0x0000_00ff << (offset * 8), + u32::from(data[0]) << (offset * 8), + ), + 2 => ( + 0x0000_ffff << (offset * 8), + ((u32::from(data[1]) << 8) | u32::from(data[0])) << (offset * 8), + ), + 4 => (0xffff_ffff, LittleEndian::read_u32(data)), + _ => return, + }; + self.config_address = (self.config_address & !mask) | value; + } +} + +impl BusDevice for PciConfigIo { + fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + // Only allow reads to the register boundary. + let start = offset as usize % 4; + let end = start + data.len(); + if end > 4 { + for d in data.iter_mut() { + *d = 0xff; + } + return; + } + + // `offset` is relative to 0xcf8 + let value = match offset { + 0..=3 => self.config_address, + 4..=7 => self.config_space_read(), + _ => 0xffff_ffff, + }; + + for i in start..end { + data[i - start] = (value >> (i * 8)) as u8; + } + } + + fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { + // `offset` is relative to 0xcf8 + match offset { + o @ 0..=3 => { + self.set_config_address(o, data); + None + } + o @ 4..=7 => self.config_space_write(o - 4, data), + _ => None, + } + } +} + +/// Emulates PCI memory-mapped configuration access mechanism. +pub struct PciConfigMmio { + pci_bus: Arc>, +} + +impl PciConfigMmio { + pub fn new(pci_bus: Arc>) -> Self { + PciConfigMmio { pci_bus } + } + + fn config_space_read(&self, config_address: u32) -> u32 { + let (bus, device, function, register) = parse_mmio_config_address(config_address); + + // Only support one bus. + if bus != 0 { + return 0xffff_ffff; + } + + // Don't support multi-function devices. + if function > 0 { + return 0xffff_ffff; + } + + self.pci_bus + .lock() + .unwrap() + .devices + .get(&(device as u32)) + .map_or(0xffff_ffff, |d| { + d.lock().unwrap().read_config_register(register) + }) + } + + fn config_space_write(&mut self, config_address: u32, offset: u64, data: &[u8]) { + if offset as usize + data.len() > 4 { + return; + } + + let (bus, device, function, register) = parse_mmio_config_address(config_address); + + // Only support one bus. + if bus != 0 { + return; + } + + // Don't support multi-function devices. + if function > 0 { + return; + } + + let pci_bus = self.pci_bus.lock().unwrap(); + if let Some(d) = pci_bus.devices.get(&(device as u32)) { + let mut device = d.lock().unwrap(); + + // Find out if one of the device's BAR is being reprogrammed, and + // reprogram it if needed. 
+ if let Some(params) = device.detect_bar_reprogramming(register, data) { + if let Err(e) = pci_bus.device_reloc.move_bar( + params.old_base, + params.new_base, + params.len, + device.deref_mut(), + params.region_type, + ) { + error!( + "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x})", + e, params.old_base, params.new_base, params.len + ); + } + } + + // Update the register value + device.write_config_register(register, offset, data); + } + } +} + +impl BusDevice for PciConfigMmio { + fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + // Only allow reads to the register boundary. + let start = offset as usize % 4; + let end = start + data.len(); + if end > 4 || offset > u64::from(u32::MAX) { + for d in data { + *d = 0xff; + } + return; + } + + let value = self.config_space_read(offset as u32); + for i in start..end { + data[i - start] = (value >> (i * 8)) as u8; + } + } + + fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { + if offset > u64::from(u32::MAX) { + return None; + } + self.config_space_write(offset as u32, offset % 4, data); + + None + } +} + +fn shift_and_mask(value: u32, offset: usize, mask: u32) -> usize { + ((value >> offset) & mask) as usize +} + +// Parse the MMIO address offset to a (bus, device, function, register) tuple. +// See section 7.2.2 PCI Express Enhanced Configuration Access Mechanism (ECAM) +// from the Pci Express Base Specification Revision 5.0 Version 1.0. +fn parse_mmio_config_address(config_address: u32) -> (usize, usize, usize, usize) { + const BUS_NUMBER_OFFSET: usize = 20; + const BUS_NUMBER_MASK: u32 = 0x00ff; + const DEVICE_NUMBER_OFFSET: usize = 15; + const DEVICE_NUMBER_MASK: u32 = 0x1f; + const FUNCTION_NUMBER_OFFSET: usize = 12; + const FUNCTION_NUMBER_MASK: u32 = 0x07; + const REGISTER_NUMBER_OFFSET: usize = 2; + const REGISTER_NUMBER_MASK: u32 = 0x3ff; + + ( + shift_and_mask(config_address, BUS_NUMBER_OFFSET, BUS_NUMBER_MASK), + shift_and_mask(config_address, DEVICE_NUMBER_OFFSET, DEVICE_NUMBER_MASK), + shift_and_mask(config_address, FUNCTION_NUMBER_OFFSET, FUNCTION_NUMBER_MASK), + shift_and_mask(config_address, REGISTER_NUMBER_OFFSET, REGISTER_NUMBER_MASK), + ) +} + +// Parse the CONFIG_ADDRESS register to a (bus, device, function, register) tuple. 
+fn parse_io_config_address(config_address: u32) -> (usize, usize, usize, usize) { + const BUS_NUMBER_OFFSET: usize = 16; + const BUS_NUMBER_MASK: u32 = 0x00ff; + const DEVICE_NUMBER_OFFSET: usize = 11; + const DEVICE_NUMBER_MASK: u32 = 0x1f; + const FUNCTION_NUMBER_OFFSET: usize = 8; + const FUNCTION_NUMBER_MASK: u32 = 0x07; + const REGISTER_NUMBER_OFFSET: usize = 2; + const REGISTER_NUMBER_MASK: u32 = 0x3f; + + ( + shift_and_mask(config_address, BUS_NUMBER_OFFSET, BUS_NUMBER_MASK), + shift_and_mask(config_address, DEVICE_NUMBER_OFFSET, DEVICE_NUMBER_MASK), + shift_and_mask(config_address, FUNCTION_NUMBER_OFFSET, FUNCTION_NUMBER_MASK), + shift_and_mask(config_address, REGISTER_NUMBER_OFFSET, REGISTER_NUMBER_MASK), + ) +} + +#[cfg(test)] +mod tests { + use std::sync::atomic::AtomicUsize; + use std::sync::{Arc, Mutex}; + + use vm_device::BusDevice; + + use super::{PciBus, PciConfigIo, PciConfigMmio, PciRoot}; + use crate::bus::{DEVICE_ID_INTEL_VIRT_PCIE_HOST, VENDOR_ID_INTEL}; + use crate::{ + DeviceRelocation, PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciClassCode, + PciConfiguration, PciDevice, PciHeaderType, PciMassStorageSubclass, + }; + + #[derive(Debug, Default)] + struct RelocationMock { + reloc_cnt: AtomicUsize, + } + + impl RelocationMock { + fn cnt(&self) -> usize { + self.reloc_cnt.load(std::sync::atomic::Ordering::SeqCst) + } + } + + impl DeviceRelocation for RelocationMock { + fn move_bar( + &self, + _old_base: u64, + _new_base: u64, + _len: u64, + _pci_dev: &mut dyn crate::PciDevice, + _region_type: crate::PciBarRegionType, + ) -> std::result::Result<(), std::io::Error> { + self.reloc_cnt + .fetch_add(1, std::sync::atomic::Ordering::SeqCst); + Ok(()) + } + } + + struct PciDevMock(PciConfiguration); + + impl PciDevMock { + fn new() -> Self { + let mut config = PciConfiguration::new( + 0x42, + 0x0, + 0x0, + PciClassCode::MassStorage, + &PciMassStorageSubclass::SerialScsiController, + None, + PciHeaderType::Device, + 0x13, + 0x12, + None, + None, + ); + + config + .add_pci_bar(&PciBarConfiguration { + addr: 0x1000, + size: 0x1000, + idx: 0, + region_type: PciBarRegionType::Memory32BitRegion, + prefetchable: PciBarPrefetchable::Prefetchable, + }) + .unwrap(); + + PciDevMock(config) + } + } + + impl PciDevice for PciDevMock { + fn write_config_register( + &mut self, + reg_idx: usize, + offset: u64, + data: &[u8], + ) -> Option> { + self.0.write_config_register(reg_idx, offset, data); + None + } + + fn read_config_register(&mut self, reg_idx: usize) -> u32 { + self.0.read_reg(reg_idx) + } + + fn detect_bar_reprogramming( + &mut self, + reg_idx: usize, + data: &[u8], + ) -> Option { + self.0.detect_bar_reprogramming(reg_idx, data) + } + } + + #[test] + fn test_writing_io_config_address() { + let mock = Arc::new(RelocationMock::default()); + let root = PciRoot::new(None); + let mut bus = PciConfigIo::new(Arc::new(Mutex::new(PciBus::new(root, mock)))); + + assert_eq!(bus.config_address, 0); + // Writing more than 32 bits will should fail + bus.write(0, 0, &[0x42; 8]); + assert_eq!(bus.config_address, 0); + // Write all the address at once + bus.write(0, 0, &[0x13, 0x12, 0x11, 0x10]); + assert_eq!(bus.config_address, 0x10111213); + // Not writing 32bits at offset 0 should have no effect + bus.write(0, 1, &[0x0; 4]); + assert_eq!(bus.config_address, 0x10111213); + + // Write two bytes at a time + bus.write(0, 0, &[0x42, 0x42]); + assert_eq!(bus.config_address, 0x10114242); + bus.write(0, 1, &[0x43, 0x43]); + assert_eq!(bus.config_address, 0x10434342); + bus.write(0, 2, 
&[0x44, 0x44]); + assert_eq!(bus.config_address, 0x44444342); + // Writing two bytes at offset 3 should overflow, so it shouldn't have any effect + bus.write(0, 3, &[0x45, 0x45]); + assert_eq!(bus.config_address, 0x44444342); + + // Write one byte at a time + bus.write(0, 0, &[0x0]); + assert_eq!(bus.config_address, 0x44444300); + bus.write(0, 1, &[0x0]); + assert_eq!(bus.config_address, 0x44440000); + bus.write(0, 2, &[0x0]); + assert_eq!(bus.config_address, 0x44000000); + bus.write(0, 3, &[0x0]); + assert_eq!(bus.config_address, 0x00000000); + // Writing past 4 bytes should have no effect + bus.write(0, 4, &[0x13]); + assert_eq!(bus.config_address, 0x0); + } + + #[test] + fn test_reading_io_config_address() { + let mock = Arc::new(RelocationMock::default()); + let root = PciRoot::new(None); + let mut bus = PciConfigIo::new(Arc::new(Mutex::new(PciBus::new(root, mock)))); + + let mut buffer = [0u8; 4]; + + bus.config_address = 0x13121110; + + // First 4 bytes are the config address + // Next 4 bytes are the values read from the configuration space. + // + // Reading past offset 7 should not return nothing (all 1s) + bus.read(0, 8, &mut buffer); + assert_eq!(buffer, [0xff; 4]); + + // offset + buffer.len() needs to be smaller or equal than 4 + bus.read(0, 1, &mut buffer); + assert_eq!(buffer, [0xff; 4]); + bus.read(0, 2, &mut buffer[..3]); + assert_eq!(buffer, [0xff; 4]); + bus.read(0, 3, &mut buffer[..2]); + assert_eq!(buffer, [0xff; 4]); + + // reading one byte at a time + bus.read(0, 0, &mut buffer[0..1]); + assert_eq!(buffer, [0x10, 0xff, 0xff, 0xff]); + bus.read(0, 1, &mut buffer[1..2]); + assert_eq!(buffer, [0x10, 0x11, 0xff, 0xff]); + bus.read(0, 2, &mut buffer[2..3]); + assert_eq!(buffer, [0x10, 0x11, 0x12, 0xff]); + bus.read(0, 3, &mut buffer[3..4]); + assert_eq!(buffer, [0x10, 0x11, 0x12, 0x13]); + + // reading two bytes at a time + bus.config_address = 0x42434445; + bus.read(0, 0, &mut buffer[..2]); + assert_eq!(buffer, [0x45, 0x44, 0x12, 0x13]); + bus.read(0, 1, &mut buffer[..2]); + assert_eq!(buffer, [0x44, 0x43, 0x12, 0x13]); + bus.read(0, 2, &mut buffer[..2]); + assert_eq!(buffer, [0x43, 0x42, 0x12, 0x13]); + + // reading all of it at once + bus.read(0, 0, &mut buffer); + assert_eq!(buffer, [0x45, 0x44, 0x43, 0x42]); + } + + fn initialize_bus() -> (PciConfigMmio, PciConfigIo, Arc) { + let mock = Arc::new(RelocationMock::default()); + let root = PciRoot::new(None); + let mut bus = PciBus::new(root, mock.clone()); + bus.add_device(1, Arc::new(Mutex::new(PciDevMock::new()))) + .unwrap(); + let bus = Arc::new(Mutex::new(bus)); + (PciConfigMmio::new(bus.clone()), PciConfigIo::new(bus), mock) + } + + #[test] + fn test_invalid_register_boundary_reads() { + let (mut mmio_config, mut io_config, _) = initialize_bus(); + + // Read crossing register boundaries + let mut buffer = [0u8; 4]; + mmio_config.read(0, 1, &mut buffer); + assert_eq!(0xffff_ffff, u32::from_le_bytes(buffer)); + + let mut buffer = [0u8; 4]; + io_config.read(0, 1, &mut buffer); + assert_eq!(0xffff_ffff, u32::from_le_bytes(buffer)); + + // As well in the config space + let mut buffer = [0u8; 4]; + io_config.read(0, 5, &mut buffer); + assert_eq!(0xffff_ffff, u32::from_le_bytes(buffer)); + } + + // MMIO config addresses are of the form + // + // | Base address upper bits | Bus Number | Device Number | Function Number | Register number | Byte offset | + // | 31-28 | 27-20 | 19-15 | 14-12 | 11-2 | 0-1 | + // + // Meaning that the offset is built using: + // + // `bus << 20 | device << 15 | function << 12 | register << 2 | 
byte` + fn mmio_offset(bus: u8, device: u8, function: u8, register: u16, byte: u8) -> u32 { + assert!(device < 32); + assert!(function < 8); + assert!(register < 1024); + assert!(byte < 4); + + (bus as u32) << 20 + | (device as u32) << 15 + | (function as u32) << 12 + | (register as u32) << 2 + | (byte as u32) + } + + fn read_mmio_config( + config: &mut PciConfigMmio, + bus: u8, + device: u8, + function: u8, + register: u16, + byte: u8, + data: &mut [u8], + ) { + config.read( + 0, + mmio_offset(bus, device, function, register, byte) as u64, + data, + ); + } + + fn write_mmio_config( + config: &mut PciConfigMmio, + bus: u8, + device: u8, + function: u8, + register: u16, + byte: u8, + data: &[u8], + ) { + config.write( + 0, + mmio_offset(bus, device, function, register, byte) as u64, + data, + ); + } + + // Similarly, when using the IO mechanism the config addresses have the following format + // + // | Enabled | zeros | Bus Number | Device Number | Function Number | Register number | zeros | + // | 31 | 30-24 | 23-16 | 15-11 | 10-8 | 7-2 | 1-0 | + // + // + // Meaning that the address is built using: + // + // 0x8000_0000 | bus << 16 | device << 11 | function << 8 | register << 2; + // + // Only 32-bit aligned accesses are allowed here. + fn pio_offset(enabled: bool, bus: u8, device: u8, function: u8, register: u8) -> u32 { + assert!(device < 32); + assert!(function < 8); + assert!(register < 64); + + let offset = if enabled { 0x8000_0000 } else { 0u32 }; + + offset + | (bus as u32) << 16 + | (device as u32) << 11 + | (function as u32) << 8 + | (register as u32) << 2 + } + + fn set_io_address( + config: &mut PciConfigIo, + enabled: bool, + bus: u8, + device: u8, + function: u8, + register: u8, + ) { + let address = u32::to_le_bytes(pio_offset(enabled, bus, device, function, register)); + config.write(0, 0, &address); + } + + fn read_io_config( + config: &mut PciConfigIo, + enabled: bool, + bus: u8, + device: u8, + function: u8, + register: u8, + data: &mut [u8], + ) { + set_io_address(config, enabled, bus, device, function, register); + config.read(0, 4, data); + } + + fn write_io_config( + config: &mut PciConfigIo, + enabled: bool, + bus: u8, + device: u8, + function: u8, + register: u8, + data: &[u8], + ) { + set_io_address(config, enabled, bus, device, function, register); + config.write(0, 4, data); + } + + #[test] + fn test_mmio_invalid_bus_number() { + let (mut mmio_config, _, _) = initialize_bus(); + let mut buffer = [0u8; 4]; + + // Asking for Bus 1 should return all 1s + read_mmio_config(&mut mmio_config, 1, 0, 0, 0, 0, &mut buffer); + assert_eq!(buffer, u32::to_le_bytes(0xffff_ffff)); + // Writing the same + buffer[0] = 0x42; + write_mmio_config(&mut mmio_config, 1, 0, 0, 15, 0, &buffer); + read_mmio_config(&mut mmio_config, 1, 0, 0, 15, 0, &mut buffer); + assert_eq!(buffer, u32::to_le_bytes(0xffff_ffff)); + read_mmio_config(&mut mmio_config, 0, 0, 0, 15, 0, &mut buffer); + assert_eq!(buffer, u32::to_le_bytes(0x0)); + + // Asking for Bus 0 should work + read_mmio_config(&mut mmio_config, 0, 0, 0, 0, 0, &mut buffer); + assert_eq!(&buffer[..2], &u16::to_le_bytes(VENDOR_ID_INTEL)); + assert_eq!( + &buffer[2..], + &u16::to_le_bytes(DEVICE_ID_INTEL_VIRT_PCIE_HOST) + ); + } + + #[test] + fn test_io_invalid_bus_number() { + let (_, mut pio_config, _) = initialize_bus(); + let mut buffer = [0u8; 4]; + + // Asking for Bus 1 should return all 1s + read_io_config(&mut pio_config, true, 1, 0, 0, 0, &mut buffer); + assert_eq!(buffer, u32::to_le_bytes(0xffff_ffff)); + + // Asking for Bus 0 
should work + read_io_config(&mut pio_config, true, 0, 0, 0, 0, &mut buffer); + assert_eq!(&buffer[..2], &u16::to_le_bytes(VENDOR_ID_INTEL)); + assert_eq!( + &buffer[2..], + &u16::to_le_bytes(DEVICE_ID_INTEL_VIRT_PCIE_HOST) + ); + } + + #[test] + fn test_mmio_invalid_function() { + let (mut mmio_config, _, _) = initialize_bus(); + let mut buffer = [0u8; 4]; + + // Asking for Bus 1 should return all 1s + read_mmio_config(&mut mmio_config, 0, 0, 1, 0, 0, &mut buffer); + assert_eq!(buffer, u32::to_le_bytes(0xffff_ffff)); + // Writing the same + buffer[0] = 0x42; + write_mmio_config(&mut mmio_config, 0, 0, 1, 15, 0, &buffer); + read_mmio_config(&mut mmio_config, 0, 0, 1, 15, 0, &mut buffer); + assert_eq!(buffer, u32::to_le_bytes(0xffff_ffff)); + read_mmio_config(&mut mmio_config, 0, 0, 0, 15, 0, &mut buffer); + assert_eq!(buffer, u32::to_le_bytes(0x0)); + + // Asking for Bus 0 should work + read_mmio_config(&mut mmio_config, 0, 0, 0, 0, 0, &mut buffer); + assert_eq!(&buffer[..2], &u16::to_le_bytes(VENDOR_ID_INTEL)); + assert_eq!( + &buffer[2..], + &u16::to_le_bytes(DEVICE_ID_INTEL_VIRT_PCIE_HOST) + ); + } + + #[test] + fn test_io_invalid_function() { + let (_, mut pio_config, _) = initialize_bus(); + let mut buffer = [0u8; 4]; + + // Asking for Bus 1 should return all 1s + read_io_config(&mut pio_config, true, 0, 0, 1, 0, &mut buffer); + assert_eq!(buffer, u32::to_le_bytes(0xffff_ffff)); + + // Asking for Bus 0 should work + read_io_config(&mut pio_config, true, 0, 0, 0, 0, &mut buffer); + assert_eq!(&buffer[..2], &u16::to_le_bytes(VENDOR_ID_INTEL)); + assert_eq!( + &buffer[2..], + &u16::to_le_bytes(DEVICE_ID_INTEL_VIRT_PCIE_HOST) + ); + } + + #[test] + fn test_io_disabled_reads() { + let (_, mut pio_config, _) = initialize_bus(); + let mut buffer = [0u8; 4]; + + // Trying to read without enabling should return all 1s + read_io_config(&mut pio_config, false, 0, 0, 0, 0, &mut buffer); + assert_eq!(buffer, u32::to_le_bytes(0xffff_ffff)); + + // Asking for Bus 0 should work + read_io_config(&mut pio_config, true, 0, 0, 0, 0, &mut buffer); + assert_eq!(&buffer[..2], &u16::to_le_bytes(VENDOR_ID_INTEL)); + assert_eq!( + &buffer[2..], + &u16::to_le_bytes(DEVICE_ID_INTEL_VIRT_PCIE_HOST) + ); + } + + #[test] + fn test_io_disabled_writes() { + let (_, mut pio_config, _) = initialize_bus(); + + // Try to write the IRQ line used for the root port. + let mut buffer = [0u8; 4]; + + // First read the current value (use `enabled` bit) + read_io_config(&mut pio_config, true, 0, 0, 0, 15, &mut buffer); + let irq_line = buffer[0]; + + // Write without setting the `enabled` bit. + buffer[0] = 0x42; + write_io_config(&mut pio_config, false, 0, 0, 0, 15, &buffer); + + // IRQ line shouldn't have changed + read_io_config(&mut pio_config, true, 0, 0, 0, 15, &mut buffer); + assert_eq!(buffer[0], irq_line); + + // Write with `enabled` bit set. 
+ buffer[0] = 0x42; + write_io_config(&mut pio_config, true, 0, 0, 0, 15, &buffer); + + // IRQ line should change + read_io_config(&mut pio_config, true, 0, 0, 0, 15, &mut buffer); + assert_eq!(buffer[0], 0x42); + } + + #[test] + fn test_mmio_writes() { + let (mut mmio_config, _, _) = initialize_bus(); + let mut buffer = [0u8; 4]; + + read_mmio_config(&mut mmio_config, 0, 0, 0, 15, 0, &mut buffer); + assert_eq!(buffer[0], 0x0); + write_mmio_config(&mut mmio_config, 0, 0, 0, 15, 0, &[0x42]); + read_mmio_config(&mut mmio_config, 0, 0, 0, 15, 0, &mut buffer); + assert_eq!(buffer[0], 0x42); + } + + #[test] + fn test_bar_reprogramming() { + let (mut mmio_config, _, mock) = initialize_bus(); + let mut buffer = [0u8; 4]; + assert_eq!(mock.cnt(), 0); + + read_mmio_config(&mut mmio_config, 0, 1, 0, 0x4, 0, &mut buffer); + let old_addr = u32::from_le_bytes(buffer) & 0xffff_fff0; + assert_eq!(old_addr, 0x1000); + write_mmio_config( + &mut mmio_config, + 0, + 1, + 0, + 0x4, + 0, + &u32::to_le_bytes(0x1312_1110), + ); + + read_mmio_config(&mut mmio_config, 0, 1, 0, 0x4, 0, &mut buffer); + let new_addr = u32::from_le_bytes(buffer) & 0xffff_fff0; + assert_eq!(new_addr, 0x1312_1110); + assert_eq!(mock.cnt(), 1); + + // BAR1 should not be used, so reading its address should return all 0s + read_mmio_config(&mut mmio_config, 0, 1, 0, 0x5, 0, &mut buffer); + assert_eq!(buffer, [0x0, 0x0, 0x0, 0x0]); + + // and reprogramming shouldn't have any effect + write_mmio_config( + &mut mmio_config, + 0, + 1, + 0, + 0x5, + 0, + &u32::to_le_bytes(0x1312_1110), + ); + + read_mmio_config(&mut mmio_config, 0, 1, 0, 0x5, 0, &mut buffer); + assert_eq!(buffer, [0x0, 0x0, 0x0, 0x0]); + } +} diff --git a/src/pci/src/configuration.rs b/src/pci/src/configuration.rs new file mode 100644 index 00000000000..fd1e3958ec8 --- /dev/null +++ b/src/pci/src/configuration.rs @@ -0,0 +1,1593 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::fmt::{self, Display}; +use std::sync::{Arc, Mutex}; + +use byteorder::{ByteOrder, LittleEndian}; +use serde::{Deserialize, Serialize}; + +use crate::device::BarReprogrammingParams; +use crate::MsixConfig; + +// The number of 32bit registers in the config space, 4096 bytes. +const NUM_CONFIGURATION_REGISTERS: usize = 1024; + +const STATUS_REG: usize = 1; +const STATUS_REG_CAPABILITIES_USED_MASK: u32 = 0x0010_0000; +const BAR0_REG: usize = 4; +const ROM_BAR_REG: usize = 12; +const BAR_IO_ADDR_MASK: u32 = 0xffff_fffc; +const BAR_MEM_ADDR_MASK: u32 = 0xffff_fff0; +const ROM_BAR_ADDR_MASK: u32 = 0xffff_f800; +const MSI_CAPABILITY_REGISTER_MASK: u32 = 0x0071_0000; +const MSIX_CAPABILITY_REGISTER_MASK: u32 = 0xc000_0000; +const NUM_BAR_REGS: usize = 6; +const CAPABILITY_LIST_HEAD_OFFSET: usize = 0x34; +const FIRST_CAPABILITY_OFFSET: usize = 0x40; +const CAPABILITY_MAX_OFFSET: usize = 192; + +pub const PCI_CONFIGURATION_ID: &str = "pci_configuration"; + +/// Represents the types of PCI headers allowed in the configuration registers. +#[derive(Copy, Clone)] +pub enum PciHeaderType { + Device, + Bridge, +} + +/// Classes of PCI nodes. 
+#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciClassCode { + TooOld, + MassStorage, + NetworkController, + DisplayController, + MultimediaController, + MemoryController, + BridgeDevice, + SimpleCommunicationController, + BaseSystemPeripheral, + InputDevice, + DockingStation, + Processor, + SerialBusController, + WirelessController, + IntelligentIoController, + EncryptionController, + DataAcquisitionSignalProcessing, + Other = 0xff, +} + +impl PciClassCode { + pub fn get_register_value(self) -> u8 { + self as u8 + } +} + +/// A PCI subclass. Each class in `PciClassCode` can specify a unique set of subclasses. This trait +/// is implemented by each subclass. It allows use of a trait object to generate configurations. +pub trait PciSubclass { + /// Convert this subclass to the value used in the PCI specification. + fn get_register_value(&self) -> u8; +} + +/// Subclasses of the MultimediaController class. +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciMultimediaSubclass { + VideoController = 0x00, + AudioController = 0x01, + TelephonyDevice = 0x02, + AudioDevice = 0x03, + Other = 0x80, +} + +impl PciSubclass for PciMultimediaSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Subclasses of the BridgeDevice +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciBridgeSubclass { + HostBridge = 0x00, + IsaBridge = 0x01, + EisaBridge = 0x02, + McaBridge = 0x03, + PciToPciBridge = 0x04, + PcmciaBridge = 0x05, + NuBusBridge = 0x06, + CardBusBridge = 0x07, + RacEwayBridge = 0x08, + PciToPciSemiTransparentBridge = 0x09, + InfiniBrandToPciHostBridge = 0x0a, + OtherBridgeDevice = 0x80, +} + +impl PciSubclass for PciBridgeSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Subclass of the SerialBus +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciSerialBusSubClass { + Firewire = 0x00, + Accessbus = 0x01, + Ssa = 0x02, + Usb = 0x03, +} + +impl PciSubclass for PciSerialBusSubClass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Mass Storage Sub Classes +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciMassStorageSubclass { + ScsiStorage = 0x00, + IdeInterface = 0x01, + FloppyController = 0x02, + IpiController = 0x03, + RaidController = 0x04, + AtaController = 0x05, + SataController = 0x06, + SerialScsiController = 0x07, + NvmController = 0x08, + MassStorage = 0x80, +} + +impl PciSubclass for PciMassStorageSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Network Controller Sub Classes +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciNetworkControllerSubclass { + EthernetController = 0x00, + TokenRingController = 0x01, + FddiController = 0x02, + AtmController = 0x03, + IsdnController = 0x04, + WorldFipController = 0x05, + PicmgController = 0x06, + InfinibandController = 0x07, + FabricController = 0x08, + NetworkController = 0x80, +} + +impl PciSubclass for PciNetworkControllerSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Trait to define a PCI class programming interface +/// +/// Each combination of `PciClassCode` and `PciSubclass` can specify a +/// set of register-level programming interfaces. +/// This trait is implemented by each programming interface. +/// It allows use of a trait object to generate configurations. +pub trait PciProgrammingInterface { + /// Convert this programming interface to the value used in the PCI specification. + fn get_register_value(&self) -> u8; +} + +/// Types of PCI capabilities. 
+#[derive(Debug, PartialEq, Eq, Copy, Clone)] +#[allow(dead_code)] +#[allow(non_camel_case_types)] +#[repr(u8)] +pub enum PciCapabilityId { + ListId = 0, + PowerManagement = 0x01, + AcceleratedGraphicsPort = 0x02, + VitalProductData = 0x03, + SlotIdentification = 0x04, + MessageSignalledInterrupts = 0x05, + CompactPciHotSwap = 0x06, + PciX = 0x07, + HyperTransport = 0x08, + VendorSpecific = 0x09, + Debugport = 0x0A, + CompactPciCentralResourceControl = 0x0B, + PciStandardHotPlugController = 0x0C, + BridgeSubsystemVendorDeviceId = 0x0D, + AgpTargetPciPcibridge = 0x0E, + SecureDevice = 0x0F, + PciExpress = 0x10, + MsiX = 0x11, + SataDataIndexConf = 0x12, + PciAdvancedFeatures = 0x13, + PciEnhancedAllocation = 0x14, +} + +impl From for PciCapabilityId { + fn from(c: u8) -> Self { + match c { + 0 => PciCapabilityId::ListId, + 0x01 => PciCapabilityId::PowerManagement, + 0x02 => PciCapabilityId::AcceleratedGraphicsPort, + 0x03 => PciCapabilityId::VitalProductData, + 0x04 => PciCapabilityId::SlotIdentification, + 0x05 => PciCapabilityId::MessageSignalledInterrupts, + 0x06 => PciCapabilityId::CompactPciHotSwap, + 0x07 => PciCapabilityId::PciX, + 0x08 => PciCapabilityId::HyperTransport, + 0x09 => PciCapabilityId::VendorSpecific, + 0x0A => PciCapabilityId::Debugport, + 0x0B => PciCapabilityId::CompactPciCentralResourceControl, + 0x0C => PciCapabilityId::PciStandardHotPlugController, + 0x0D => PciCapabilityId::BridgeSubsystemVendorDeviceId, + 0x0E => PciCapabilityId::AgpTargetPciPcibridge, + 0x0F => PciCapabilityId::SecureDevice, + 0x10 => PciCapabilityId::PciExpress, + 0x11 => PciCapabilityId::MsiX, + 0x12 => PciCapabilityId::SataDataIndexConf, + 0x13 => PciCapabilityId::PciAdvancedFeatures, + 0x14 => PciCapabilityId::PciEnhancedAllocation, + _ => PciCapabilityId::ListId, + } + } +} + +/// Types of PCI Express capabilities. 
+#[derive(PartialEq, Eq, Copy, Clone, Debug)] +#[allow(dead_code)] +#[repr(u16)] +pub enum PciExpressCapabilityId { + NullCapability = 0x0000, + AdvancedErrorReporting = 0x0001, + VirtualChannelMultiFunctionVirtualChannelNotPresent = 0x0002, + DeviceSerialNumber = 0x0003, + PowerBudgeting = 0x0004, + RootComplexLinkDeclaration = 0x0005, + RootComplexInternalLinkControl = 0x0006, + RootComplexEventCollectorEndpointAssociation = 0x0007, + MultiFunctionVirtualChannel = 0x0008, + VirtualChannelMultiFunctionVirtualChannelPresent = 0x0009, + RootComplexRegisterBlock = 0x000a, + VendorSpecificExtendedCapability = 0x000b, + ConfigurationAccessCorrelation = 0x000c, + AccessControlServices = 0x000d, + AlternativeRoutingIdentificationInterpretation = 0x000e, + AddressTranslationServices = 0x000f, + SingleRootIoVirtualization = 0x0010, + DeprecatedMultiRootIoVirtualization = 0x0011, + Multicast = 0x0012, + PageRequestInterface = 0x0013, + ReservedForAmd = 0x0014, + ResizeableBar = 0x0015, + DynamicPowerAllocation = 0x0016, + ThpRequester = 0x0017, + LatencyToleranceReporting = 0x0018, + SecondaryPciExpress = 0x0019, + ProtocolMultiplexing = 0x001a, + ProcessAddressSpaceId = 0x001b, + LnRequester = 0x001c, + DownstreamPortContainment = 0x001d, + L1PmSubstates = 0x001e, + PrecisionTimeMeasurement = 0x001f, + PciExpressOverMphy = 0x0020, + FRSQueueing = 0x0021, + ReadinessTimeReporting = 0x0022, + DesignatedVendorSpecificExtendedCapability = 0x0023, + VfResizeableBar = 0x0024, + DataLinkFeature = 0x0025, + PhysicalLayerSixteenGts = 0x0026, + LaneMarginingAtTheReceiver = 0x0027, + HierarchyId = 0x0028, + NativePcieEnclosureManagement = 0x0029, + PhysicalLayerThirtyTwoGts = 0x002a, + AlternateProtocol = 0x002b, + SystemFirmwareIntermediary = 0x002c, + ShadowFunctions = 0x002d, + DataObjectExchange = 0x002e, + Reserved = 0x002f, + ExtendedCapabilitiesAbsence = 0xffff, +} + +impl From for PciExpressCapabilityId { + fn from(c: u16) -> Self { + match c { + 0x0000 => PciExpressCapabilityId::NullCapability, + 0x0001 => PciExpressCapabilityId::AdvancedErrorReporting, + 0x0002 => PciExpressCapabilityId::VirtualChannelMultiFunctionVirtualChannelNotPresent, + 0x0003 => PciExpressCapabilityId::DeviceSerialNumber, + 0x0004 => PciExpressCapabilityId::PowerBudgeting, + 0x0005 => PciExpressCapabilityId::RootComplexLinkDeclaration, + 0x0006 => PciExpressCapabilityId::RootComplexInternalLinkControl, + 0x0007 => PciExpressCapabilityId::RootComplexEventCollectorEndpointAssociation, + 0x0008 => PciExpressCapabilityId::MultiFunctionVirtualChannel, + 0x0009 => PciExpressCapabilityId::VirtualChannelMultiFunctionVirtualChannelPresent, + 0x000a => PciExpressCapabilityId::RootComplexRegisterBlock, + 0x000b => PciExpressCapabilityId::VendorSpecificExtendedCapability, + 0x000c => PciExpressCapabilityId::ConfigurationAccessCorrelation, + 0x000d => PciExpressCapabilityId::AccessControlServices, + 0x000e => PciExpressCapabilityId::AlternativeRoutingIdentificationInterpretation, + 0x000f => PciExpressCapabilityId::AddressTranslationServices, + 0x0010 => PciExpressCapabilityId::SingleRootIoVirtualization, + 0x0011 => PciExpressCapabilityId::DeprecatedMultiRootIoVirtualization, + 0x0012 => PciExpressCapabilityId::Multicast, + 0x0013 => PciExpressCapabilityId::PageRequestInterface, + 0x0014 => PciExpressCapabilityId::ReservedForAmd, + 0x0015 => PciExpressCapabilityId::ResizeableBar, + 0x0016 => PciExpressCapabilityId::DynamicPowerAllocation, + 0x0017 => PciExpressCapabilityId::ThpRequester, + 0x0018 => 
PciExpressCapabilityId::LatencyToleranceReporting,
+            0x0019 => PciExpressCapabilityId::SecondaryPciExpress,
+            0x001a => PciExpressCapabilityId::ProtocolMultiplexing,
+            0x001b => PciExpressCapabilityId::ProcessAddressSpaceId,
+            0x001c => PciExpressCapabilityId::LnRequester,
+            0x001d => PciExpressCapabilityId::DownstreamPortContainment,
+            0x001e => PciExpressCapabilityId::L1PmSubstates,
+            0x001f => PciExpressCapabilityId::PrecisionTimeMeasurement,
+            0x0020 => PciExpressCapabilityId::PciExpressOverMphy,
+            0x0021 => PciExpressCapabilityId::FRSQueueing,
+            0x0022 => PciExpressCapabilityId::ReadinessTimeReporting,
+            0x0023 => PciExpressCapabilityId::DesignatedVendorSpecificExtendedCapability,
+            0x0024 => PciExpressCapabilityId::VfResizeableBar,
+            0x0025 => PciExpressCapabilityId::DataLinkFeature,
+            0x0026 => PciExpressCapabilityId::PhysicalLayerSixteenGts,
+            0x0027 => PciExpressCapabilityId::LaneMarginingAtTheReceiver,
+            0x0028 => PciExpressCapabilityId::HierarchyId,
+            0x0029 => PciExpressCapabilityId::NativePcieEnclosureManagement,
+            0x002a => PciExpressCapabilityId::PhysicalLayerThirtyTwoGts,
+            0x002b => PciExpressCapabilityId::AlternateProtocol,
+            0x002c => PciExpressCapabilityId::SystemFirmwareIntermediary,
+            0x002d => PciExpressCapabilityId::ShadowFunctions,
+            0x002e => PciExpressCapabilityId::DataObjectExchange,
+            0xffff => PciExpressCapabilityId::ExtendedCapabilitiesAbsence,
+            _ => PciExpressCapabilityId::Reserved,
+        }
+    }
+}
+
+/// A PCI capability list. Devices can optionally specify capabilities in their configuration space.
+pub trait PciCapability {
+    fn bytes(&self) -> &[u8];
+    fn id(&self) -> PciCapabilityId;
+}
+
+fn encode_32_bits_bar_size(bar_size: u32) -> Option<u32> {
+    if bar_size > 0 {
+        return Some(!(bar_size - 1));
+    }
+    None
+}
+
+fn decode_32_bits_bar_size(bar_size: u32) -> Option<u32> {
+    if bar_size > 0 {
+        return Some(!bar_size + 1);
+    }
+    None
+}
+
+fn encode_64_bits_bar_size(bar_size: u64) -> Option<(u32, u32)> {
+    if bar_size > 0 {
+        let result = !(bar_size - 1);
+        let result_hi = (result >> 32) as u32;
+        let result_lo = (result & 0xffff_ffff) as u32;
+        return Some((result_hi, result_lo));
+    }
+    None
+}
+
+fn decode_64_bits_bar_size(bar_size_hi: u32, bar_size_lo: u32) -> Option<u64> {
+    let bar_size: u64 = ((bar_size_hi as u64) << 32) | (bar_size_lo as u64);
+    if bar_size > 0 {
+        return Some(!bar_size + 1);
+    }
+    None
+}
+
+#[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)]
+struct PciBar {
+    addr: u32,
+    size: u32,
+    used: bool,
+    r#type: Option<PciBarRegionType>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PciConfigurationState {
+    registers: Vec<u32>,
+    writable_bits: Vec<u32>,
+    bars: Vec<PciBar>,
+    rom_bar_addr: u32,
+    rom_bar_size: u32,
+    rom_bar_used: bool,
+    last_capability: Option<(usize, usize)>,
+    msix_cap_reg_idx: Option<usize>,
+}
+
+/// Contains the configuration space of a PCI node.
+///
+/// See the [specification](https://en.wikipedia.org/wiki/PCI_configuration_space).
+/// The configuration space is accessed with DWORD reads and writes from the guest.
+pub struct PciConfiguration {
+    registers: [u32; NUM_CONFIGURATION_REGISTERS],
+    writable_bits: [u32; NUM_CONFIGURATION_REGISTERS], // writable bits for each register.
+    bars: [PciBar; NUM_BAR_REGS],
+    rom_bar_addr: u32,
+    rom_bar_size: u32,
+    rom_bar_used: bool,
+    // Contains the byte offset and size of the last capability.
+    last_capability: Option<(usize, usize)>,
+    msix_cap_reg_idx: Option<usize>,
+    msix_config: Option<Arc<Mutex<MsixConfig>>>,
+}
+
+/// See pci_regs.h in kernel
+#[derive(Copy, Clone, PartialEq, Eq, Serialize, Deserialize, Debug)]
+pub enum PciBarRegionType {
+    Memory32BitRegion = 0,
+    IoRegion = 0x01,
+    Memory64BitRegion = 0x04,
+}
+
+#[derive(Debug, Copy, Clone, Serialize, Deserialize)]
+pub enum PciBarPrefetchable {
+    NotPrefetchable = 0,
+    Prefetchable = 0x08,
+}
+
+impl From<PciBarPrefetchable> for bool {
+    fn from(val: PciBarPrefetchable) -> Self {
+        match val {
+            PciBarPrefetchable::NotPrefetchable => false,
+            PciBarPrefetchable::Prefetchable => true,
+        }
+    }
+}
+
+#[derive(Debug, Copy, Clone, Serialize, Deserialize)]
+pub struct PciBarConfiguration {
+    pub addr: u64,
+    pub size: u64,
+    pub idx: usize,
+    pub region_type: PciBarRegionType,
+    pub prefetchable: PciBarPrefetchable,
+}
+
+#[derive(Debug)]
+pub enum Error {
+    BarAddressInvalid(u64, u64),
+    BarInUse(usize),
+    BarInUse64(usize),
+    BarInvalid(usize),
+    BarInvalid64(usize),
+    BarSizeInvalid(u64),
+    CapabilitySpaceFull(usize),
+    Decode32BarSize,
+    Decode64BarSize,
+    Encode32BarSize,
+    Encode64BarSize,
+    RomBarAddressInvalid(u64, u64),
+    RomBarInUse(usize),
+    RomBarInvalid(usize),
+    RomBarSizeInvalid(u64),
+}
+pub type Result<T> = std::result::Result<T, Error>;
+
+impl std::error::Error for Error {}
+
+impl Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        use self::Error::*;
+        match self {
+            BarAddressInvalid(a, s) => write!(f, "address {a} size {s} too big"),
+            BarInUse(b) => write!(f, "bar {b} already used"),
+            BarInUse64(b) => write!(f, "64bit bar {b} already used(requires two regs)"),
+            BarInvalid(b) => write!(f, "bar {} invalid, max {}", b, NUM_BAR_REGS - 1),
+            BarInvalid64(b) => write!(
+                f,
+                "64bitbar {} invalid, requires two regs, max {}",
+                b,
+                NUM_BAR_REGS - 1
+            ),
+            BarSizeInvalid(s) => write!(f, "bar address {s} not a power of two"),
+            CapabilitySpaceFull(s) => write!(f, "capability of size {s} doesn't fit"),
+            Decode32BarSize => write!(f, "failed to decode 32 bits BAR size"),
+            Decode64BarSize => write!(f, "failed to decode 64 bits BAR size"),
+            Encode32BarSize => write!(f, "failed to encode 32 bits BAR size"),
+            Encode64BarSize => write!(f, "failed to encode 64 bits BAR size"),
+            RomBarAddressInvalid(a, s) => write!(f, "address {a} size {s} too big"),
+            RomBarInUse(b) => write!(f, "rom bar {b} already used"),
+            RomBarInvalid(b) => write!(f, "rom bar {} invalid, max {}", b, NUM_BAR_REGS - 1),
+            RomBarSizeInvalid(s) => write!(f, "rom bar address {s} not a power of two"),
+        }
+    }
+}
+
+impl PciConfiguration {
+    #[allow(clippy::too_many_arguments)]
+    pub fn new(
+        vendor_id: u16,
+        device_id: u16,
+        revision_id: u8,
+        class_code: PciClassCode,
+        subclass: &dyn PciSubclass,
+        programming_interface: Option<&dyn PciProgrammingInterface>,
+        header_type: PciHeaderType,
+        subsystem_vendor_id: u16,
+        subsystem_id: u16,
+        msix_config: Option<Arc<Mutex<MsixConfig>>>,
+        state: Option<PciConfigurationState>,
+    ) -> Self {
+        let (
+            registers,
+            writable_bits,
+            bars,
+            rom_bar_addr,
+            rom_bar_size,
+            rom_bar_used,
+            last_capability,
+            msix_cap_reg_idx,
+        ) = if let Some(state) = state {
+            (
+                state.registers.try_into().unwrap(),
+                state.writable_bits.try_into().unwrap(),
+                state.bars.try_into().unwrap(),
+                state.rom_bar_addr,
+                state.rom_bar_size,
+                state.rom_bar_used,
+                state.last_capability,
+                state.msix_cap_reg_idx,
+            )
+        } else {
+            let mut registers = [0u32; NUM_CONFIGURATION_REGISTERS];
+            let mut writable_bits = [0u32; NUM_CONFIGURATION_REGISTERS];
+            registers[0] = (u32::from(device_id) << 16) |
u32::from(vendor_id); + // TODO(dverkamp): Status should be write-1-to-clear + writable_bits[1] = 0x0000_ffff; // Status (r/o), command (r/w) + let pi = if let Some(pi) = programming_interface { + pi.get_register_value() + } else { + 0 + }; + registers[2] = (u32::from(class_code.get_register_value()) << 24) + | (u32::from(subclass.get_register_value()) << 16) + | (u32::from(pi) << 8) + | u32::from(revision_id); + writable_bits[3] = 0x0000_00ff; // Cacheline size (r/w) + match header_type { + PciHeaderType::Device => { + registers[3] = 0x0000_0000; // Header type 0 (device) + writable_bits[15] = 0x0000_00ff; // Interrupt line (r/w) + } + PciHeaderType::Bridge => { + registers[3] = 0x0001_0000; // Header type 1 (bridge) + writable_bits[9] = 0xfff0_fff0; // Memory base and limit + writable_bits[15] = 0xffff_00ff; // Bridge control (r/w), interrupt line (r/w) + } + }; + registers[11] = (u32::from(subsystem_id) << 16) | u32::from(subsystem_vendor_id); + + ( + registers, + writable_bits, + [PciBar::default(); NUM_BAR_REGS], + 0, + 0, + false, + None, + None, + ) + }; + + PciConfiguration { + registers, + writable_bits, + bars, + rom_bar_addr, + rom_bar_size, + rom_bar_used, + last_capability, + msix_cap_reg_idx, + msix_config, + } + } + + pub fn state(&self) -> PciConfigurationState { + PciConfigurationState { + registers: self.registers.to_vec(), + writable_bits: self.writable_bits.to_vec(), + bars: self.bars.to_vec(), + rom_bar_addr: self.rom_bar_addr, + rom_bar_size: self.rom_bar_size, + rom_bar_used: self.rom_bar_used, + last_capability: self.last_capability, + msix_cap_reg_idx: self.msix_cap_reg_idx, + } + } + + /// Reads a 32bit register from `reg_idx` in the register map. + pub fn read_reg(&self, reg_idx: usize) -> u32 { + *(self.registers.get(reg_idx).unwrap_or(&0xffff_ffff)) + } + + /// Writes a 32bit register to `reg_idx` in the register map. + pub fn write_reg(&mut self, reg_idx: usize, value: u32) { + let mut mask = self.writable_bits[reg_idx]; + + if (BAR0_REG..BAR0_REG + NUM_BAR_REGS).contains(®_idx) { + // Handle very specific case where the BAR is being written with + // all 1's to retrieve the BAR size during next BAR reading. + if value == 0xffff_ffff { + mask &= self.bars[reg_idx - 4].size; + } + } else if reg_idx == ROM_BAR_REG { + // Handle very specific case where the BAR is being written with + // all 1's on bits 31-11 to retrieve the BAR size during next BAR + // reading. + if value & ROM_BAR_ADDR_MASK == ROM_BAR_ADDR_MASK { + mask &= self.rom_bar_size; + } + } + + if let Some(r) = self.registers.get_mut(reg_idx) { + *r = (*r & !self.writable_bits[reg_idx]) | (value & mask); + } else { + warn!("bad PCI register write {}", reg_idx); + } + } + + /// Writes a 16bit word to `offset`. `offset` must be 16bit aligned. + pub fn write_word(&mut self, offset: usize, value: u16) { + let shift = match offset % 4 { + 0 => 0, + 2 => 16, + _ => { + warn!("bad PCI config write offset {}", offset); + return; + } + }; + let reg_idx = offset / 4; + + if let Some(r) = self.registers.get_mut(reg_idx) { + let writable_mask = self.writable_bits[reg_idx]; + let mask = (0xffffu32 << shift) & writable_mask; + let shifted_value = (u32::from(value) << shift) & writable_mask; + *r = *r & !mask | shifted_value; + } else { + warn!("bad PCI config write offset {}", offset); + } + } + + /// Writes a byte to `offset`. + pub fn write_byte(&mut self, offset: usize, value: u8) { + self.write_byte_internal(offset, value, true); + } + + /// Writes a byte to `offset`, optionally enforcing read-only bits. 
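The write paths above all apply the same read-modify-write rule: a guest write may only change bits that are set in the register's writable mask. The snippet below is a minimal standalone sketch of that rule, assuming the command/status layout set up in `new()` (command writable in the low 16 bits, status read-only in the high 16 bits); the helper name is made up.

```rust
// Only bits set in the writable mask can change; everything else keeps its value.
fn masked_write(current: u32, writable_mask: u32, value: u32) -> u32 {
    (current & !writable_mask) | (value & writable_mask)
}

fn main() {
    let reg = 0x0010_0000; // a status bit set, command bits cleared
    let after = masked_write(reg, 0x0000_ffff, 0xffff_0007);
    assert_eq!(after, 0x0010_0007); // status untouched, command updated
}
```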
+ fn write_byte_internal(&mut self, offset: usize, value: u8, apply_writable_mask: bool) { + let shift = (offset % 4) * 8; + let reg_idx = offset / 4; + + if let Some(r) = self.registers.get_mut(reg_idx) { + let writable_mask = if apply_writable_mask { + self.writable_bits[reg_idx] + } else { + 0xffff_ffff + }; + let mask = (0xffu32 << shift) & writable_mask; + let shifted_value = (u32::from(value) << shift) & writable_mask; + *r = *r & !mask | shifted_value; + } else { + warn!("bad PCI config write offset {}", offset); + } + } + + /// Adds a region specified by `config`. Configures the specified BAR(s) to + /// report this region and size to the guest kernel. Enforces a few constraints + /// (i.e, region size must be power of two, register not already used). + pub fn add_pci_bar(&mut self, config: &PciBarConfiguration) -> Result<()> { + let bar_idx = config.idx; + let reg_idx = BAR0_REG + bar_idx; + + if bar_idx >= NUM_BAR_REGS { + return Err(Error::BarInvalid(bar_idx)); + } + + if self.bars[bar_idx].used { + return Err(Error::BarInUse(bar_idx)); + } + + if !config.size.is_power_of_two() { + return Err(Error::BarSizeInvalid(config.size)); + } + + let end_addr = config + .addr + .checked_add(config.size - 1) + .ok_or(Error::BarAddressInvalid(config.addr, config.size))?; + match config.region_type { + PciBarRegionType::Memory32BitRegion | PciBarRegionType::IoRegion => { + if end_addr > u64::from(u32::MAX) { + return Err(Error::BarAddressInvalid(config.addr, config.size)); + } + + // Encode the BAR size as expected by the software running in + // the guest. + self.bars[bar_idx].size = + encode_32_bits_bar_size(config.size as u32).ok_or(Error::Encode32BarSize)?; + } + PciBarRegionType::Memory64BitRegion => { + if bar_idx + 1 >= NUM_BAR_REGS { + return Err(Error::BarInvalid64(bar_idx)); + } + + if self.bars[bar_idx + 1].used { + return Err(Error::BarInUse64(bar_idx + 1)); + } + + // Encode the BAR size as expected by the software running in + // the guest. + let (bar_size_hi, bar_size_lo) = + encode_64_bits_bar_size(config.size).ok_or(Error::Encode64BarSize)?; + + self.registers[reg_idx + 1] = (config.addr >> 32) as u32; + self.writable_bits[reg_idx + 1] = 0xffff_ffff; + self.bars[bar_idx + 1].addr = self.registers[reg_idx + 1]; + self.bars[bar_idx].size = bar_size_lo; + self.bars[bar_idx + 1].size = bar_size_hi; + self.bars[bar_idx + 1].used = true; + } + } + + let (mask, lower_bits) = match config.region_type { + PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => ( + BAR_MEM_ADDR_MASK, + config.prefetchable as u32 | config.region_type as u32, + ), + PciBarRegionType::IoRegion => (BAR_IO_ADDR_MASK, config.region_type as u32), + }; + + self.registers[reg_idx] = ((config.addr as u32) & mask) | lower_bits; + self.writable_bits[reg_idx] = mask; + self.bars[bar_idx].addr = self.registers[reg_idx]; + self.bars[bar_idx].used = true; + self.bars[bar_idx].r#type = Some(config.region_type); + + Ok(()) + } + + /// Returns the address of the given BAR region. + pub fn get_bar_addr(&self, bar_num: usize) -> u64 { + let bar_idx = BAR0_REG + bar_num; + + let mut addr = u64::from(self.bars[bar_num].addr & self.writable_bits[bar_idx]); + + if let Some(bar_type) = self.bars[bar_num].r#type { + if bar_type == PciBarRegionType::Memory64BitRegion { + addr |= u64::from(self.bars[bar_num + 1].addr) << 32; + } + } + + addr + } + + /// Adds the capability `cap_data` to the list of capabilities. + /// + /// `cap_data` should not include the two-byte PCI capability header (type, next). 
+ /// Correct values will be generated automatically based on `cap_data.id()` and + /// `cap_data.len()`. + pub fn add_capability(&mut self, cap_data: &dyn PciCapability) -> Result { + let total_len = cap_data.bytes().len() + 2; + let (cap_offset, tail_offset) = match self.last_capability { + Some((offset, len)) => (Self::next_dword(offset, len), offset + 1), + None => (FIRST_CAPABILITY_OFFSET, CAPABILITY_LIST_HEAD_OFFSET), + }; + let end_offset = cap_offset + .checked_add(total_len) + .ok_or(Error::CapabilitySpaceFull(total_len))?; + if end_offset > CAPABILITY_MAX_OFFSET { + return Err(Error::CapabilitySpaceFull(total_len)); + } + self.registers[STATUS_REG] |= STATUS_REG_CAPABILITIES_USED_MASK; + self.write_byte_internal(tail_offset, cap_offset as u8, false); + self.write_byte_internal(cap_offset, cap_data.id() as u8, false); + self.write_byte_internal(cap_offset + 1, 0, false); // Next pointer. + for (i, byte) in cap_data.bytes().iter().enumerate() { + self.write_byte_internal(cap_offset + i + 2, *byte, false); + } + self.last_capability = Some((cap_offset, total_len)); + + match cap_data.id() { + PciCapabilityId::MessageSignalledInterrupts => { + self.writable_bits[cap_offset / 4] = MSI_CAPABILITY_REGISTER_MASK; + } + PciCapabilityId::MsiX => { + self.msix_cap_reg_idx = Some(cap_offset / 4); + self.writable_bits[self.msix_cap_reg_idx.unwrap()] = MSIX_CAPABILITY_REGISTER_MASK; + } + _ => {} + } + + Ok(cap_offset) + } + + // Find the next aligned offset after the one given. + fn next_dword(offset: usize, len: usize) -> usize { + let next = offset + len; + (next + 3) & !3 + } + + pub fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) { + if reg_idx >= NUM_CONFIGURATION_REGISTERS { + return; + } + + if offset as usize + data.len() > 4 { + return; + } + + // Handle potential write to MSI-X message control register + if let Some(msix_cap_reg_idx) = self.msix_cap_reg_idx { + if let Some(msix_config) = &self.msix_config { + if msix_cap_reg_idx == reg_idx && offset == 2 && data.len() == 2 { + // 2-bytes write in the Message Control field + msix_config + .lock() + .unwrap() + .set_msg_ctl(LittleEndian::read_u16(data)); + } else if msix_cap_reg_idx == reg_idx && offset == 0 && data.len() == 4 { + // 4 bytes write at the beginning. Ignore the first 2 bytes which are the + // capability id and next capability pointer + msix_config + .lock() + .unwrap() + .set_msg_ctl((LittleEndian::read_u32(data) >> 16) as u16); + } + } + } + + match data.len() { + 1 => self.write_byte(reg_idx * 4 + offset as usize, data[0]), + 2 => self.write_word( + reg_idx * 4 + offset as usize, + u16::from(data[0]) | (u16::from(data[1]) << 8), + ), + 4 => self.write_reg(reg_idx, LittleEndian::read_u32(data)), + _ => (), + } + } + + pub fn detect_bar_reprogramming( + &mut self, + reg_idx: usize, + data: &[u8], + ) -> Option { + if data.len() != 4 { + return None; + } + + let value = LittleEndian::read_u32(data); + + let mask = self.writable_bits[reg_idx]; + if (BAR0_REG..BAR0_REG + NUM_BAR_REGS).contains(®_idx) { + // Ignore the case where the BAR size is being asked for. + if value == 0xffff_ffff { + return None; + } + + let bar_idx = reg_idx - 4; + // Handle special case where the address being written is + // different from the address initially provided. This is a + // BAR reprogramming case which needs to be properly caught. 
+ if let Some(bar_type) = self.bars[bar_idx].r#type { + // In case of 64 bits memory BAR, we don't do anything until + // the upper BAR is modified, otherwise we would be moving the + // BAR to a wrong location in memory. + if bar_type == PciBarRegionType::Memory64BitRegion { + return None; + } + + // Ignore the case where the value is unchanged. + if (value & mask) == (self.bars[bar_idx].addr & mask) { + return None; + } + + info!( + "Detected BAR reprogramming: (BAR {}) 0x{:x}->0x{:x}", + reg_idx, self.registers[reg_idx], value + ); + let old_base = u64::from(self.bars[bar_idx].addr & mask); + let new_base = u64::from(value & mask); + let len = u64::from( + decode_32_bits_bar_size(self.bars[bar_idx].size) + .ok_or(Error::Decode32BarSize) + .unwrap(), + ); + let region_type = bar_type; + + self.bars[bar_idx].addr = value; + + return Some(BarReprogrammingParams { + old_base, + new_base, + len, + region_type, + }); + } else if (reg_idx > BAR0_REG) + && ( + // The lower BAR (of this 64bit BAR) has been reprogrammed to a different value + // than it used to be + (self.registers[reg_idx - 1] & self.writable_bits[reg_idx - 1]) + != (self.bars[bar_idx - 1].addr & self.writable_bits[reg_idx - 1]) || + // Or the lower BAR hasn't been changed but the upper one is being reprogrammed + // now to a different value + (value & mask) != (self.bars[bar_idx].addr & mask) + ) + { + info!( + "Detected BAR reprogramming: (BAR {}) 0x{:x}->0x{:x}", + reg_idx, self.registers[reg_idx], value + ); + let old_base = (u64::from(self.bars[bar_idx].addr & mask) << 32) + | u64::from(self.bars[bar_idx - 1].addr & self.writable_bits[reg_idx - 1]); + let new_base = (u64::from(value & mask) << 32) + | u64::from(self.registers[reg_idx - 1] & self.writable_bits[reg_idx - 1]); + let len = + decode_64_bits_bar_size(self.bars[bar_idx].size, self.bars[bar_idx - 1].size) + .ok_or(Error::Decode64BarSize) + .unwrap(); + let region_type = PciBarRegionType::Memory64BitRegion; + + self.bars[bar_idx].addr = value; + self.bars[bar_idx - 1].addr = self.registers[reg_idx - 1]; + + return Some(BarReprogrammingParams { + old_base, + new_base, + len, + region_type, + }); + } + } else if reg_idx == ROM_BAR_REG && (value & mask) != (self.rom_bar_addr & mask) { + // Ignore the case where the BAR size is being asked for. + if value & ROM_BAR_ADDR_MASK == ROM_BAR_ADDR_MASK { + return None; + } + + info!( + "Detected ROM BAR reprogramming: (BAR {}) 0x{:x}->0x{:x}", + reg_idx, self.registers[reg_idx], value + ); + let old_base = u64::from(self.rom_bar_addr & mask); + let new_base = u64::from(value & mask); + let len = u64::from( + decode_32_bits_bar_size(self.rom_bar_size) + .ok_or(Error::Decode32BarSize) + .unwrap(), + ); + let region_type = PciBarRegionType::Memory32BitRegion; + + self.rom_bar_addr = value; + + return Some(BarReprogrammingParams { + old_base, + new_base, + len, + region_type, + }); + } + + None + } +} + +impl Default for PciBarConfiguration { + fn default() -> Self { + PciBarConfiguration { + idx: 0, + addr: 0, + size: 0, + region_type: PciBarRegionType::Memory64BitRegion, + prefetchable: PciBarPrefetchable::NotPrefetchable, + } + } +} + +#[cfg(test)] +mod tests { + + use vm_memory::ByteValued; + + use super::*; + use crate::MsixCap; + + #[repr(C, packed)] + #[derive(Clone, Copy, Default)] + #[allow(dead_code)] + struct TestCap { + len: u8, + foo: u8, + } + + // SAFETY: All members are simple numbers and any value is valid. 
+ unsafe impl ByteValued for TestCap {} + + impl PciCapability for TestCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } + } + + struct BadCap { + data: Vec, + } + + impl BadCap { + fn new(len: u8) -> Self { + Self { + data: (0..len).collect(), + } + } + } + + impl PciCapability for BadCap { + fn bytes(&self) -> &[u8] { + &self.data + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } + } + + #[test] + fn add_capability() { + let mut cfg = PciConfiguration::new( + 0x1234, + 0x5678, + 0x1, + PciClassCode::MultimediaController, + &PciMultimediaSubclass::AudioController, + None, + PciHeaderType::Device, + 0xABCD, + 0x2468, + None, + None, + ); + + // Bad size capabilities + assert!(matches!( + cfg.add_capability(&BadCap::new(127)), + Err(Error::CapabilitySpaceFull(129)) + )); + cfg.add_capability(&BadCap::new(62)).unwrap(); + cfg.add_capability(&BadCap::new(62)).unwrap(); + assert!(matches!( + cfg.add_capability(&BadCap::new(0)), + Err(Error::CapabilitySpaceFull(2)) + )); + // Reset capabilities + cfg.last_capability = None; + + // Add two capabilities with different contents. + let cap1 = TestCap { len: 4, foo: 0xAA }; + let cap1_offset = cfg.add_capability(&cap1).unwrap(); + assert_eq!(cap1_offset % 4, 0); + + let cap2 = TestCap { + len: 0x04, + foo: 0x55, + }; + let cap2_offset = cfg.add_capability(&cap2).unwrap(); + assert_eq!(cap2_offset % 4, 0); + + // The capability list head should be pointing to cap1. + let cap_ptr = cfg.read_reg(CAPABILITY_LIST_HEAD_OFFSET / 4) & 0xFF; + assert_eq!(cap1_offset, cap_ptr as usize); + + // Verify the contents of the capabilities. + let cap1_data = cfg.read_reg(cap1_offset / 4); + assert_eq!(cap1_data & 0xFF, 0x09); // capability ID + assert_eq!((cap1_data >> 8) & 0xFF, cap2_offset as u32); // next capability pointer + assert_eq!((cap1_data >> 16) & 0xFF, 0x04); // cap1.len + assert_eq!((cap1_data >> 24) & 0xFF, 0xAA); // cap1.foo + + let cap2_data = cfg.read_reg(cap2_offset / 4); + assert_eq!(cap2_data & 0xFF, 0x09); // capability ID + assert_eq!((cap2_data >> 8) & 0xFF, 0x00); // next capability pointer + assert_eq!((cap2_data >> 16) & 0xFF, 0x04); // cap2.len + assert_eq!((cap2_data >> 24) & 0xFF, 0x55); // cap2.foo + } + + #[test] + fn test_msix_capability() { + let mut cfg = PciConfiguration::new( + 0x1234, + 0x5678, + 0x1, + PciClassCode::MultimediaController, + &PciMultimediaSubclass::AudioController, + None, + PciHeaderType::Device, + 0xABCD, + 0x2468, + None, + None, + ); + + // Information about the MSI-X capability layout: https://wiki.osdev.org/PCI#Enabling_MSI-X + let msix_cap = MsixCap::new( + 3, // Using BAR3 for message control table + 1024, // 1024 MSI-X vectors + 0x4000, // Offset of message control table inside the BAR + 4, // BAR4 used for pending control bit + 0x420, // Offset of pending bit array (PBA) inside BAR + ); + cfg.add_capability(&msix_cap).unwrap(); + + let cap_reg = FIRST_CAPABILITY_OFFSET / 4; + let reg = cfg.read_reg(cap_reg); + // Capability ID is MSI-X + assert_eq!( + PciCapabilityId::from((reg & 0xff) as u8), + PciCapabilityId::MsiX + ); + // We only have one capability, so `next` should be 0 + assert_eq!(((reg >> 8) & 0xff) as u8, 0); + let msg_ctl = (reg >> 16) as u16; + + // MSI-X is enabled + assert_eq!(msg_ctl & 0x8000, 0x8000); + // Vectors are not masked + assert_eq!(msg_ctl & 0x4000, 0x0); + // Reserved bits are 0 + assert_eq!(msg_ctl & 0x3800, 0x0); + // We've got 1024 vectors (Table size is N-1 
encoded) + assert_eq!((msg_ctl & 0x7ff) + 1, 1024); + + let reg = cfg.read_reg(cap_reg + 1); + // We are using BAR3 + assert_eq!(reg & 0x7, 3); + // Message Control Table is located in offset 0x4000 inside the BAR + // We don't need to shift. Offset needs to be 8-byte aligned - so BIR + // is stored in its last 3 bits (which we need to mask out). + assert_eq!(reg & 0xffff_fff8, 0x4000); + + let reg = cfg.read_reg(cap_reg + 2); + // PBA is 0x420 bytes inside BAR4 + assert_eq!(reg & 0x7, 4); + assert_eq!(reg & 0xffff_fff8, 0x420); + + // Check read/write mask + // Capability Id of MSI-X is 0x11 + cfg.write_config_register(cap_reg, 0, &[0x0]); + assert_eq!( + PciCapabilityId::from((cfg.read_reg(cap_reg) & 0xff) as u8), + PciCapabilityId::MsiX + ); + // Cannot override next capability pointer + cfg.write_config_register(cap_reg, 1, &[0x42]); + assert_eq!((cfg.read_reg(cap_reg) >> 8) & 0xff, 0); + + // We are writing this: + // + // meaning: | MSI enabled | Vectors Masked | Reserved | Table size | + // bit: | 15 | 14 | 13 - 11 | 0 - 10 | + // R/W: | R/W | R/W | R | R | + let msg_ctl = (cfg.read_reg(cap_reg) >> 16) as u16; + // Try to flip all bits + cfg.write_config_register(cap_reg, 2, &u16::to_le_bytes(!msg_ctl)); + let msg_ctl = (cfg.read_reg(cap_reg) >> 16) as u16; + // MSI enabled and Vectors masked should be flipped (MSI disabled and vectors masked) + assert_eq!(msg_ctl & 0xc000, 0x4000); + // Reserved bits should still be 0 + assert_eq!(msg_ctl & 0x3800, 0); + // Table size should not have changed + assert_eq!((msg_ctl & 0x07ff) + 1, 1024); + + // Table offset is read only + let table_offset = cfg.read_reg(cap_reg + 1); + // Try to flip all bits + cfg.write_config_register(cap_reg + 1, 0, &u32::to_le_bytes(!table_offset)); + // None should be flipped + assert_eq!(cfg.read_reg(cap_reg + 1), table_offset); + + // PBA offset also + let pba_offset = cfg.read_reg(cap_reg + 2); + // Try to flip all bits + cfg.write_config_register(cap_reg + 2, 0, &u32::to_le_bytes(!pba_offset)); + // None should be flipped + assert_eq!(cfg.read_reg(cap_reg + 2), pba_offset); + } + + #[derive(Copy, Clone)] + enum TestPi { + Test = 0x5a, + } + + impl PciProgrammingInterface for TestPi { + fn get_register_value(&self) -> u8 { + *self as u8 + } + } + + #[test] + fn class_code() { + let cfg = PciConfiguration::new( + 0x1234, + 0x5678, + 0x1, + PciClassCode::MultimediaController, + &PciMultimediaSubclass::AudioController, + Some(&TestPi::Test), + PciHeaderType::Device, + 0xABCD, + 0x2468, + None, + None, + ); + + let class_reg = cfg.read_reg(2); + let class_code = (class_reg >> 24) & 0xFF; + let subclass = (class_reg >> 16) & 0xFF; + let prog_if = (class_reg >> 8) & 0xFF; + assert_eq!(class_code, 0x04); + assert_eq!(subclass, 0x01); + assert_eq!(prog_if, 0x5a); + } + + #[test] + fn test_bar_size_encoding() { + assert!(encode_32_bits_bar_size(0).is_none()); + assert!(decode_32_bits_bar_size(0).is_none()); + assert!(encode_64_bits_bar_size(0).is_none()); + assert!(decode_64_bits_bar_size(0, 0).is_none()); + + // According to OSDev wiki (https://wiki.osdev.org/PCI#Address_and_size_of_the_BAR): + // + // > To determine the amount of address space needed by a PCI device, you must save the + // > original value of the BAR, write a value of all 1's to the register, then read it back. + // > The amount of memory can then be determined by masking the information bits, performing + // > a bitwise NOT ('~' in C), and incrementing the value by 1. The original value of the + // BAR > should then be restored. 
The BAR register is naturally aligned and as such you can + // only > modify the bits that are set. For example, if a device utilizes 16 MB it will + // have BAR0 > filled with 0xFF000000 (0x1000000 after decoding) and you can only modify + // the upper > 8-bits. + // + // So we should be encoding an address like this: `addr` -> `!(addr - 1)` + let encoded = encode_32_bits_bar_size(0x0101_0101).unwrap(); + assert_eq!(encoded, 0xfefe_feff); + assert_eq!(decode_32_bits_bar_size(encoded), Some(0x0101_0101)); + + // Similarly we encode a 64 bits size and then store it as a 2 32bit addresses (we use + // two BARs). + let (hi, lo) = encode_64_bits_bar_size(0xffff_ffff_ffff_fff0).unwrap(); + assert_eq!(hi, 0); + assert_eq!(lo, 0x0000_0010); + assert_eq!(decode_64_bits_bar_size(hi, lo), Some(0xffff_ffff_ffff_fff0)); + } + + #[test] + fn test_add_pci_bar() { + let mut pci_config = PciConfiguration::new( + 0x42, + 0x0, + 0x0, + PciClassCode::MassStorage, + &PciMassStorageSubclass::SerialScsiController, + None, + PciHeaderType::Device, + 0x13, + 0x12, + None, + None, + ); + + // BAR size can only be a power of 2 + assert!(matches!( + pci_config.add_pci_bar(&PciBarConfiguration { + addr: 0x1000, + size: 0x1001, + idx: 0, + region_type: PciBarRegionType::Memory32BitRegion, + prefetchable: PciBarPrefetchable::Prefetchable, + }), + Err(Error::BarSizeInvalid(0x1001)) + )); + + // Invalid BAR index + assert!(matches!( + pci_config.add_pci_bar(&PciBarConfiguration { + addr: 0x1000, + size: 0x1000, + idx: NUM_BAR_REGS, + region_type: PciBarRegionType::Memory32BitRegion, + prefetchable: PciBarPrefetchable::Prefetchable + }), + Err(Error::BarInvalid(NUM_BAR_REGS)) + )); + // 64bit BARs need 2 BAR slots actually + assert!(matches!( + pci_config.add_pci_bar(&PciBarConfiguration { + addr: 0x1000, + size: 0x1000, + idx: NUM_BAR_REGS - 1, + region_type: PciBarRegionType::Memory64BitRegion, + prefetchable: PciBarPrefetchable::Prefetchable + }), + Err(Error::BarInvalid64(_)) + )); + + // Check for valid addresses + // Can't have an address that exceeds 32 bits for a 32bit BAR + assert!(matches!( + pci_config.add_pci_bar(&PciBarConfiguration { + addr: 0x1000_0000_0000_0000, + size: 0x1000, + idx: 0, + region_type: PciBarRegionType::Memory32BitRegion, + prefetchable: PciBarPrefetchable::Prefetchable + }), + Err(Error::BarAddressInvalid(0x1000_0000_0000_0000, 0x1000)) + )); + // Ensure that we handle properly overflows in 64bit BAR ranges + assert!(matches!( + pci_config.add_pci_bar(&PciBarConfiguration { + addr: u64::MAX, + size: 0x2, + idx: 0, + region_type: PciBarRegionType::Memory64BitRegion, + prefetchable: PciBarPrefetchable::Prefetchable + }), + Err(Error::BarAddressInvalid(u64::MAX, 2)) + )); + + // We can't reuse a BAR slot + pci_config + .add_pci_bar(&PciBarConfiguration { + addr: 0x1000, + size: 0x1000, + idx: 0, + region_type: PciBarRegionType::Memory32BitRegion, + prefetchable: PciBarPrefetchable::Prefetchable, + }) + .unwrap(); + assert!(matches!( + pci_config.add_pci_bar(&PciBarConfiguration { + addr: 0x1000, + size: 0x1000, + idx: 0, + region_type: PciBarRegionType::Memory32BitRegion, + prefetchable: PciBarPrefetchable::Prefetchable, + }), + Err(Error::BarInUse(0)) + )); + pci_config + .add_pci_bar(&PciBarConfiguration { + addr: 0x0000_0001_0000_0000, + size: 0x2000, + idx: 2, + region_type: PciBarRegionType::Memory64BitRegion, + prefetchable: PciBarPrefetchable::Prefetchable, + }) + .unwrap(); + // For 64bit BARs two BARs are used (in this case BARs 1 and 2) + assert!(matches!( + 
pci_config.add_pci_bar(&PciBarConfiguration { + addr: 0x0000_0001_0000_0000, + size: 0x1000, + idx: 2, + region_type: PciBarRegionType::Memory64BitRegion, + prefetchable: PciBarPrefetchable::Prefetchable, + }), + Err(Error::BarInUse(2)) + )); + assert!(matches!( + pci_config.add_pci_bar(&PciBarConfiguration { + addr: 0x0000_0001_0000_0000, + size: 0x1000, + idx: 1, + region_type: PciBarRegionType::Memory64BitRegion, + prefetchable: PciBarPrefetchable::Prefetchable, + }), + Err(Error::BarInUse64(2)) + )); + + assert_eq!(pci_config.get_bar_addr(0), 0x1000); + assert_eq!(pci_config.get_bar_addr(2), 0x1_0000_0000); + } + + #[test] + fn test_access_invalid_reg() { + let mut pci_config = PciConfiguration::new( + 0x42, + 0x0, + 0x0, + PciClassCode::MassStorage, + &PciMassStorageSubclass::SerialScsiController, + None, + PciHeaderType::Device, + 0x13, + 0x12, + None, + None, + ); + + // Can't read past the end of the configuration space + assert_eq!( + pci_config.read_reg(NUM_CONFIGURATION_REGISTERS), + 0xffff_ffff + ); + + // Read out all of configuration space + let config_space: Vec = (0..NUM_CONFIGURATION_REGISTERS) + .map(|reg_idx| pci_config.read_reg(reg_idx)) + .collect(); + + // Various invalid write accesses + + // Past the end of config space + pci_config.write_config_register(NUM_CONFIGURATION_REGISTERS, 0, &[0x42]); + pci_config.write_config_register(NUM_CONFIGURATION_REGISTERS, 0, &[0x42, 0x42]); + pci_config.write_config_register(NUM_CONFIGURATION_REGISTERS, 0, &[0x42, 0x42, 0x42, 0x42]); + + // Past register boundaries + pci_config.write_config_register(NUM_CONFIGURATION_REGISTERS, 1, &[0x42, 0x42, 0x42, 0x42]); + pci_config.write_config_register(NUM_CONFIGURATION_REGISTERS, 2, &[0x42, 0x42, 0x42]); + pci_config.write_config_register(NUM_CONFIGURATION_REGISTERS, 3, &[0x42, 0x42]); + pci_config.write_config_register(NUM_CONFIGURATION_REGISTERS, 4, &[0x42]); + pci_config.write_config_register(NUM_CONFIGURATION_REGISTERS, 5, &[]); + + for (reg_idx, reg) in config_space.iter().enumerate() { + assert_eq!(*reg, pci_config.read_reg(reg_idx)); + } + } + + #[test] + fn test_detect_bar_reprogramming() { + let mut pci_config = PciConfiguration::new( + 0x42, + 0x0, + 0x0, + PciClassCode::MassStorage, + &PciMassStorageSubclass::SerialScsiController, + None, + PciHeaderType::Device, + 0x13, + 0x12, + None, + None, + ); + + // Trying to reprogram with something less than 4 bytes (length of the address) should fail + assert!(pci_config + .detect_bar_reprogramming(BAR0_REG, &[0x13]) + .is_none()); + assert!(pci_config + .detect_bar_reprogramming(BAR0_REG, &[0x13, 0x12]) + .is_none()); + assert!(pci_config + .detect_bar_reprogramming(BAR0_REG, &[0x13, 0x12]) + .is_none()); + assert!(pci_config + .detect_bar_reprogramming(BAR0_REG, &[0x13, 0x12, 0x16]) + .is_none()); + + // Writing all 1s is a special case where we're actually asking for the size of the BAR + assert!(pci_config + .detect_bar_reprogramming(BAR0_REG, &u32::to_le_bytes(0xffff_ffff)) + .is_none()); + + // Trying to reprogram a BAR that hasn't be initialized does nothing + for reg_idx in BAR0_REG..BAR0_REG + NUM_BAR_REGS { + assert!(pci_config + .detect_bar_reprogramming(reg_idx, &u32::to_le_bytes(0x1312_4243)) + .is_none()); + } + + // Reprogramming of a 32bit BAR + pci_config + .add_pci_bar(&PciBarConfiguration { + addr: 0x1000, + size: 0x1000, + idx: 0, + region_type: PciBarRegionType::Memory32BitRegion, + prefetchable: PciBarPrefetchable::Prefetchable, + }) + .unwrap(); + + assert_eq!( + pci_config.detect_bar_reprogramming(BAR0_REG, 
&u32::to_le_bytes(0x2000)), + Some(BarReprogrammingParams { + old_base: 0x1000, + new_base: 0x2000, + len: 0x1000, + region_type: PciBarRegionType::Memory32BitRegion + }) + ); + + pci_config.write_config_register(BAR0_REG, 0, &u32::to_le_bytes(0x2000)); + assert_eq!(pci_config.read_reg(BAR0_REG) & 0xffff_fff0, 0x2000); + + // Attempting to reprogram the BAR with the same address should not have any effect + assert!(pci_config + .detect_bar_reprogramming(BAR0_REG, &u32::to_le_bytes(0x2000)) + .is_none()); + + // Reprogramming of a 64bit BAR + pci_config + .add_pci_bar(&PciBarConfiguration { + addr: 0x13_1200_0000, + size: 0x8000, + idx: 1, + region_type: PciBarRegionType::Memory64BitRegion, + prefetchable: PciBarPrefetchable::Prefetchable, + }) + .unwrap(); + + assert_eq!(pci_config.read_reg(BAR0_REG + 1) & 0xffff_fff0, 0x1200_0000); + assert_eq!( + pci_config.bars[1].r#type, + Some(PciBarRegionType::Memory64BitRegion) + ); + assert_eq!(pci_config.read_reg(BAR0_REG + 2), 0x13); + assert!(pci_config.bars[2].r#type.is_none()); + + // First we write the lower 32 bits and this shouldn't cause any reprogramming + assert!(pci_config + .detect_bar_reprogramming(BAR0_REG + 1, &u32::to_le_bytes(0x4200_0000)) + .is_none()); + pci_config.write_config_register(BAR0_REG + 1, 0, &u32::to_le_bytes(0x4200_0000)); + + // Writing the upper 32 bits should trigger the reprogramming + assert_eq!( + pci_config.detect_bar_reprogramming(BAR0_REG + 2, &u32::to_le_bytes(0x84)), + Some(BarReprogrammingParams { + old_base: 0x13_1200_0000, + new_base: 0x84_4200_0000, + len: 0x8000, + region_type: PciBarRegionType::Memory64BitRegion + }) + ); + pci_config.write_config_register(BAR0_REG + 2, 0, &u32::to_le_bytes(0x84)); + + // Trying to reprogram the upper bits directly (without first touching the lower bits) + // should trigger a reprogramming + assert_eq!( + pci_config.detect_bar_reprogramming(BAR0_REG + 2, &u32::to_le_bytes(0x1312)), + Some(BarReprogrammingParams { + old_base: 0x84_4200_0000, + new_base: 0x1312_4200_0000, + len: 0x8000, + region_type: PciBarRegionType::Memory64BitRegion + }) + ); + pci_config.write_config_register(BAR0_REG + 2, 0, &u32::to_le_bytes(0x1312)); + + // Attempting to reprogram the BAR with the same address should not have any effect + assert!(pci_config + .detect_bar_reprogramming(BAR0_REG + 1, &u32::to_le_bytes(0x4200_0000)) + .is_none()); + assert!(pci_config + .detect_bar_reprogramming(BAR0_REG + 2, &u32::to_le_bytes(0x1312)) + .is_none()); + } +} diff --git a/src/pci/src/device.rs b/src/pci/src/device.rs new file mode 100644 index 00000000000..11db4f478a5 --- /dev/null +++ b/src/pci/src/device.rs @@ -0,0 +1,105 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::sync::{Arc, Barrier}; +use std::{io, result}; + +use vm_allocator::AddressAllocator; + +use crate::configuration::{self, PciBarRegionType}; + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum Error { + /// Setup of the device capabilities failed: {0}. + CapabilitiesSetup(configuration::Error), + /// Allocating space for an IO BAR failed, size={0}. + IoAllocationFailed(u64), + /// Registering an IO BAR at address {0} failed: {1} + IoRegistrationFailed(u64, configuration::Error), + /// Expected resource not found. 
+ MissingResource, +} +pub type Result = std::result::Result; + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub struct BarReprogrammingParams { + pub old_base: u64, + pub new_base: u64, + pub len: u64, + pub region_type: PciBarRegionType, +} + +pub trait PciDevice: Send { + /// Allocates the needed PCI BARs space using the `allocate` function which takes a size and + /// returns an address. Returns a Vec of (GuestAddress, GuestUsize) tuples. + fn allocate_bars( + &mut self, + _mmio32_allocator: &mut AddressAllocator, + _mmio64_allocator: &mut AddressAllocator, + ) -> Result<()> { + Ok(()) + } + + /// Frees the PCI BARs previously allocated with a call to allocate_bars(). + fn free_bars( + &mut self, + _mmio32_allocator: &mut AddressAllocator, + _mmio64_allocator: &mut AddressAllocator, + ) -> Result<()> { + Ok(()) + } + + /// Sets a register in the configuration space. + /// * `reg_idx` - The index of the config register to modify. + /// * `offset` - Offset into the register. + fn write_config_register( + &mut self, + reg_idx: usize, + offset: u64, + data: &[u8], + ) -> Option>; + /// Gets a register from the configuration space. + /// * `reg_idx` - The index of the config register to read. + fn read_config_register(&mut self, reg_idx: usize) -> u32; + /// Detects if a BAR is being reprogrammed. + fn detect_bar_reprogramming( + &mut self, + _reg_idx: usize, + _data: &[u8], + ) -> Option { + None + } + /// Reads from a BAR region mapped into the device. + /// * `addr` - The guest address inside the BAR. + /// * `data` - Filled with the data from `addr`. + fn read_bar(&mut self, _base: u64, _offset: u64, _data: &mut [u8]) {} + /// Writes to a BAR region mapped into the device. + /// * `addr` - The guest address inside the BAR. + /// * `data` - The data to write. + fn write_bar(&mut self, _base: u64, _offset: u64, _data: &[u8]) -> Option> { + None + } + /// Relocates the BAR to a different address in guest address space. + fn move_bar(&mut self, _old_base: u64, _new_base: u64) -> result::Result<(), io::Error> { + Ok(()) + } +} + +/// This trait defines a set of functions which can be triggered whenever a +/// PCI device is modified in any way. +pub trait DeviceRelocation: Send + Sync { + /// The BAR needs to be moved to a different location in the guest address + /// space. This follows a decision from the software running in the guest. + fn move_bar( + &self, + old_base: u64, + new_base: u64, + len: u64, + pci_dev: &mut dyn PciDevice, + region_type: PciBarRegionType, + ) -> result::Result<(), io::Error>; +} diff --git a/src/pci/src/lib.rs b/src/pci/src/lib.rs new file mode 100644 index 00000000000..83f5a7a5dcf --- /dev/null +++ b/src/pci/src/lib.rs @@ -0,0 +1,348 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +//! Implements pci devices and busses. 
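The BAR handling above relies on the standard sizing handshake: the guest writes all 1s to a BAR, reads back an encoded value, and recovers the region size by two's complement. The following is a standalone sketch of the encoding used by the `encode_*`/`decode_*` helpers in configuration.rs, ignoring the low type/prefetch bits and assuming a non-zero, power-of-two size (the real helpers guard against zero).

```rust
// encode: what the guest reads back after writing all 1s; decode: recover the size.
fn encode_bar_size(size: u32) -> u32 {
    !(size - 1)
}

fn decode_bar_size(encoded: u32) -> u32 {
    !encoded + 1
}

fn main() {
    let size = 0x0100_0000; // a 16 MiB region
    let encoded = encode_bar_size(size);
    assert_eq!(encoded, 0xff00_0000);
    assert_eq!(decode_bar_size(encoded), size);
}
```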
+#[macro_use] +extern crate log; + +mod bus; +mod configuration; +mod device; +mod msix; + +use std::fmt::{self, Debug, Display}; +use std::num::ParseIntError; +use std::str::FromStr; + +use serde::de::Visitor; + +pub use self::bus::{PciBus, PciConfigIo, PciConfigMmio, PciRoot, PciRootError}; +pub use self::configuration::{ + PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciCapability, PciCapabilityId, + PciClassCode, PciConfiguration, PciConfigurationState, PciExpressCapabilityId, PciHeaderType, + PciMassStorageSubclass, PciNetworkControllerSubclass, PciProgrammingInterface, + PciSerialBusSubClass, PciSubclass, PCI_CONFIGURATION_ID, +}; +pub use self::device::{ + BarReprogrammingParams, DeviceRelocation, Error as PciDeviceError, PciDevice, +}; +pub use self::msix::{Error as MsixError, MsixCap, MsixConfig, MsixConfigState, MsixTableEntry}; + +/// PCI has four interrupt pins A->D. +#[derive(Copy, Clone)] +pub enum PciInterruptPin { + IntA, + IntB, + IntC, + IntD, +} + +impl PciInterruptPin { + pub fn to_mask(self) -> u32 { + self as u32 + } +} + +#[cfg(target_arch = "x86_64")] +pub const PCI_CONFIG_IO_PORT: u64 = 0xcf8; +#[cfg(target_arch = "x86_64")] +pub const PCI_CONFIG_IO_PORT_SIZE: u64 = 0x8; + +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)] +pub struct PciBdf(u32); + +struct PciBdfVisitor; + +impl Visitor<'_> for PciBdfVisitor { + type Value = PciBdf; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("struct PciBdf") + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + PciBdf::from_str(v).map_err(serde::de::Error::custom) + } +} + +impl<'de> serde::Deserialize<'de> for PciBdf { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + deserializer.deserialize_str(PciBdfVisitor) + } +} + +impl serde::Serialize for PciBdf { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + serializer.collect_str(&self.to_string()) + } +} + +impl PciBdf { + pub fn segment(&self) -> u16 { + ((self.0 >> 16) & 0xffff) as u16 + } + + pub fn bus(&self) -> u8 { + ((self.0 >> 8) & 0xff) as u8 + } + + pub fn device(&self) -> u8 { + ((self.0 >> 3) & 0x1f) as u8 + } + + pub fn function(&self) -> u8 { + (self.0 & 0x7) as u8 + } + + pub fn new(segment: u16, bus: u8, device: u8, function: u8) -> Self { + Self( + ((segment as u32) << 16) + | ((bus as u32) << 8) + | (((device & 0x1f) as u32) << 3) + | (function & 0x7) as u32, + ) + } +} + +impl From for PciBdf { + fn from(bdf: u32) -> Self { + Self(bdf) + } +} + +impl From for u32 { + fn from(bdf: PciBdf) -> Self { + bdf.0 + } +} + +impl From<&PciBdf> for u32 { + fn from(bdf: &PciBdf) -> Self { + bdf.0 + } +} + +impl From for u16 { + fn from(bdf: PciBdf) -> Self { + (bdf.0 & 0xffff) as u16 + } +} + +impl From<&PciBdf> for u16 { + fn from(bdf: &PciBdf) -> Self { + (bdf.0 & 0xffff) as u16 + } +} + +impl Debug for PciBdf { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{:04x}:{:02x}:{:02x}.{:01x}", + self.segment(), + self.bus(), + self.device(), + self.function() + ) + } +} + +impl Display for PciBdf { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{:04x}:{:02x}:{:02x}.{:01x}", + self.segment(), + self.bus(), + self.device(), + self.function() + ) + } +} + +/// Errors associated with parsing a BDF string. 
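`PciBdf` packs segment, bus, device and function into one `u32` (16/8/5/3 bits respectively) and renders it in the conventional `ssss:bb:dd.f` form. Here is a small self-contained sketch of that packing and formatting, using made-up values that match the unit tests below.

```rust
// Pack a segment/bus/device/function quadruple into a single u32.
fn pack_bdf(segment: u16, bus: u8, device: u8, function: u8) -> u32 {
    (u32::from(segment) << 16)
        | (u32::from(bus) << 8)
        | (u32::from(device & 0x1f) << 3)
        | u32::from(function & 0x7)
}

fn main() {
    let bdf = pack_bdf(0x1234, 0x56, 0x1f, 0x7);
    assert_eq!(bdf, 0x1234_56ff);
    let text = format!(
        "{:04x}:{:02x}:{:02x}.{:01x}",
        bdf >> 16,
        (bdf >> 8) & 0xff,
        (bdf >> 3) & 0x1f,
        bdf & 0x7
    );
    assert_eq!(text, "1234:56:1f.7");
}
```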
+#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum PciBdfParseError { + /// Unable to parse bus/device/function number hex: {0} + InvalidHex(#[from] ParseIntError), + /// Invalid format: {0} (expected format: 0000:00:00.0) + InvalidFormat(String), +} + +impl FromStr for PciBdf { + type Err = PciBdfParseError; + + fn from_str(s: &str) -> Result { + let items: Vec<&str> = s.split('.').collect(); + if items.len() != 2 { + return Err(PciBdfParseError::InvalidFormat(s.to_string())); + } + let function = u8::from_str_radix(items[1], 16)?; + let items: Vec<&str> = items[0].split(':').collect(); + if items.len() != 3 { + return Err(PciBdfParseError::InvalidFormat(s.to_string())); + } + let segment = u16::from_str_radix(items[0], 16)?; + let bus = u8::from_str_radix(items[1], 16)?; + let device = u8::from_str_radix(items[2], 16)?; + Ok(PciBdf::new(segment, bus, device, function)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_pci_bdf_new() { + let bdf = PciBdf::new(0x1234, 0x56, 0x1f, 0x7); + assert_eq!(bdf.segment(), 0x1234); + assert_eq!(bdf.bus(), 0x56); + assert_eq!(bdf.device(), 0x1f); + assert_eq!(bdf.function(), 0x7); + } + + #[test] + fn test_pci_bdf_from_u32() { + let bdf = PciBdf::from(0x12345678); + assert_eq!(bdf.segment(), 0x1234); + assert_eq!(bdf.bus(), 0x56); + assert_eq!(bdf.device(), 0x0f); + assert_eq!(bdf.function(), 0x0); + } + + #[test] + fn test_pci_bdf_to_u32() { + let bdf = PciBdf::new(0x1234, 0x56, 0x1f, 0x7); + let val: u32 = bdf.into(); + assert_eq!(val, 0x123456ff); + } + + #[test] + fn test_pci_bdf_to_u16() { + let bdf = PciBdf::new(0x1234, 0x56, 0x1f, 0x7); + let val: u16 = bdf.into(); + assert_eq!(val, 0x56ff); + } + + #[test] + fn test_pci_bdf_from_str_valid() { + let bdf = PciBdf::from_str("1234:56:1f.7").unwrap(); + assert_eq!(bdf.segment(), 0x1234); + assert_eq!(bdf.bus(), 0x56); + assert_eq!(bdf.device(), 0x1f); + assert_eq!(bdf.function(), 0x7); + } + + #[test] + fn test_pci_bdf_from_str_zero() { + let bdf = PciBdf::from_str("0000:00:00.0").unwrap(); + assert_eq!(bdf.segment(), 0); + assert_eq!(bdf.bus(), 0); + assert_eq!(bdf.device(), 0); + assert_eq!(bdf.function(), 0); + } + + #[test] + fn test_pci_bdf_from_str_invalid_format() { + assert!(matches!( + PciBdf::from_str("invalid"), + Err(PciBdfParseError::InvalidFormat(_)) + )); + assert!(matches!( + PciBdf::from_str("1234:56"), + Err(PciBdfParseError::InvalidFormat(_)) + )); + assert!(matches!( + PciBdf::from_str("1234:56:78:9a.b"), + Err(PciBdfParseError::InvalidFormat(_)) + )); + } + + #[test] + fn test_pci_bdf_from_str_invalid_hex() { + assert!(matches!( + PciBdf::from_str("xxxx:00:00.0"), + Err(PciBdfParseError::InvalidHex(_)) + )); + assert!(matches!( + PciBdf::from_str("0000:xx:00.0"), + Err(PciBdfParseError::InvalidHex(_)) + )); + assert!(matches!( + PciBdf::from_str("0000:00:xx.0"), + Err(PciBdfParseError::InvalidHex(_)) + )); + assert!(matches!( + PciBdf::from_str("0000:00:00.x"), + Err(PciBdfParseError::InvalidHex(_)) + )); + } + + #[test] + fn test_pci_bdf_display() { + let bdf = PciBdf::new(0x1234, 0x56, 0x1f, 0x7); + assert_eq!(format!("{}", bdf), "1234:56:1f.7"); + } + + #[test] + fn test_pci_bdf_debug() { + let bdf = PciBdf::new(0x1234, 0x56, 0x1f, 0x7); + assert_eq!(format!("{:?}", bdf), "1234:56:1f.7"); + } + + #[test] + fn test_pci_bdf_partial_eq() { + let bdf1 = PciBdf::new(0x1234, 0x56, 0x1f, 0x7); + let bdf2 = PciBdf::new(0x1234, 0x56, 0x1f, 0x7); + let bdf3 = PciBdf::new(0x1234, 0x56, 0x1f, 0x6); + assert_eq!(bdf1, bdf2); + assert_ne!(bdf1, 
bdf3); + } + + #[test] + fn test_pci_bdf_partial_ord() { + let bdf1 = PciBdf::new(0x1234, 0x56, 0x1f, 0x6); + let bdf2 = PciBdf::new(0x1234, 0x56, 0x1f, 0x7); + assert!(bdf1 < bdf2); + } + + #[test] + fn test_pci_bdf_deserialize_ok() { + // Test deserializer + let visitor = PciBdfVisitor; + let result = visitor + .visit_str::("1234:56:1f.7") + .unwrap(); + assert_eq!(result, PciBdf::new(0x1234, 0x56, 0x1f, 0x7)); + } + + #[test] + fn test_pci_bdf_deserialize_invalid() { + // Test deserializer with invalid input returns error + let visitor = PciBdfVisitor; + assert!(visitor + .visit_str::("invalid") + .is_err()); + } + + #[test] + fn test_pci_bdf_serialize() { + // Test serializer using serde_test + let bdf = PciBdf::new(0x1234, 0x56, 0x1f, 0x7); + serde_test::assert_tokens(&bdf, &[serde_test::Token::Str("1234:56:1f.7")]); + } +} diff --git a/src/pci/src/msix.rs b/src/pci/src/msix.rs new file mode 100644 index 00000000000..50abeaf9737 --- /dev/null +++ b/src/pci/src/msix.rs @@ -0,0 +1,889 @@ +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause +// + +use std::sync::Arc; +use std::{io, result}; + +use byteorder::{ByteOrder, LittleEndian}; +use serde::{Deserialize, Serialize}; +use vm_device::interrupt::{ + InterruptIndex, InterruptSourceConfig, InterruptSourceGroup, MsiIrqSourceConfig, +}; +use vm_memory::ByteValued; + +use crate::{PciCapability, PciCapabilityId}; + +const MAX_MSIX_VECTORS_PER_DEVICE: u16 = 2048; +const MSIX_TABLE_ENTRIES_MODULO: u64 = 16; +const MSIX_PBA_ENTRIES_MODULO: u64 = 8; +const BITS_PER_PBA_ENTRY: usize = 64; +const FUNCTION_MASK_BIT: u8 = 14; +const MSIX_ENABLE_BIT: u8 = 15; + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum Error { + /// Failed enabling the interrupt route. + EnableInterruptRoute(io::Error), + /// Failed updating the interrupt route. 
+ UpdateInterruptRoute(io::Error), +} + +#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] +pub struct MsixTableEntry { + pub msg_addr_lo: u32, + pub msg_addr_hi: u32, + pub msg_data: u32, + pub vector_ctl: u32, +} + +impl MsixTableEntry { + pub fn masked(&self) -> bool { + self.vector_ctl & 0x1 == 0x1 + } +} + +impl Default for MsixTableEntry { + fn default() -> Self { + MsixTableEntry { + msg_addr_lo: 0, + msg_addr_hi: 0, + msg_data: 0, + vector_ctl: 0x1, + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MsixConfigState { + table_entries: Vec, + pba_entries: Vec, + masked: bool, + enabled: bool, +} + +pub struct MsixConfig { + pub table_entries: Vec, + pub pba_entries: Vec, + pub devid: u32, + pub interrupt_source_group: Arc, + pub masked: bool, + pub enabled: bool, +} + +impl std::fmt::Debug for MsixConfig { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("MsixConfig") + .field("table_entries", &self.table_entries) + .field("pba_entries", &self.pba_entries) + .field("devid", &self.devid) + .field("masked", &self.masked) + .field("enabled", &self.enabled) + .finish() + } +} + +impl MsixConfig { + pub fn new( + msix_vectors: u16, + interrupt_source_group: Arc, + devid: u32, + state: Option, + ) -> result::Result { + assert!(msix_vectors <= MAX_MSIX_VECTORS_PER_DEVICE); + + let (table_entries, pba_entries, masked, enabled) = if let Some(state) = state { + if state.enabled && !state.masked { + for (idx, table_entry) in state.table_entries.iter().enumerate() { + if table_entry.masked() { + continue; + } + + let config = MsiIrqSourceConfig { + high_addr: table_entry.msg_addr_hi, + low_addr: table_entry.msg_addr_lo, + data: table_entry.msg_data, + devid, + }; + + interrupt_source_group + .update( + idx as InterruptIndex, + InterruptSourceConfig::MsiIrq(config), + state.masked, + true, + ) + .map_err(Error::UpdateInterruptRoute)?; + + interrupt_source_group + .enable() + .map_err(Error::EnableInterruptRoute)?; + } + } + + ( + state.table_entries, + state.pba_entries, + state.masked, + state.enabled, + ) + } else { + let mut table_entries: Vec = Vec::new(); + table_entries.resize_with(msix_vectors as usize, Default::default); + let mut pba_entries: Vec = Vec::new(); + let num_pba_entries: usize = (msix_vectors as usize).div_ceil(BITS_PER_PBA_ENTRY); + pba_entries.resize_with(num_pba_entries, Default::default); + + (table_entries, pba_entries, true, false) + }; + + Ok(MsixConfig { + table_entries, + pba_entries, + devid, + interrupt_source_group, + masked, + enabled, + }) + } + + pub fn state(&self) -> MsixConfigState { + MsixConfigState { + table_entries: self.table_entries.clone(), + pba_entries: self.pba_entries.clone(), + masked: self.masked, + enabled: self.enabled, + } + } + + pub fn set_msg_ctl(&mut self, reg: u16) { + let old_masked = self.masked; + let old_enabled = self.enabled; + + self.masked = ((reg >> FUNCTION_MASK_BIT) & 1u16) == 1u16; + self.enabled = ((reg >> MSIX_ENABLE_BIT) & 1u16) == 1u16; + + // Update interrupt routing + if old_masked != self.masked || old_enabled != self.enabled { + if self.enabled && !self.masked { + debug!("MSI-X enabled for device 0x{:x}", self.devid); + for (idx, table_entry) in self.table_entries.iter().enumerate() { + let config = MsiIrqSourceConfig { + high_addr: table_entry.msg_addr_hi, + low_addr: table_entry.msg_addr_lo, + data: table_entry.msg_data, + devid: self.devid, + }; + + if let Err(e) = self.interrupt_source_group.update( + idx as InterruptIndex, + 
InterruptSourceConfig::MsiIrq(config), + table_entry.masked(), + true, + ) { + error!("Failed updating vector: {:?}", e); + } + } + } else if old_enabled || !old_masked { + debug!("MSI-X disabled for device 0x{:x}", self.devid); + if let Err(e) = self.interrupt_source_group.disable() { + error!("Failed disabling irq_fd: {:?}", e); + } + } + } + + // If the Function Mask bit was set, and has just been cleared, it's + // important to go through the entire PBA to check if there was any + // pending MSI-X message to inject, given that the vector is not + // masked. + if old_masked && !self.masked { + for (index, entry) in self.table_entries.clone().iter().enumerate() { + if !entry.masked() && self.get_pba_bit(index as u16) == 1 { + self.inject_msix_and_clear_pba(index); + } + } + } + } + + pub fn read_table(&self, offset: u64, data: &mut [u8]) { + assert!(data.len() <= 8); + + let index: usize = (offset / MSIX_TABLE_ENTRIES_MODULO) as usize; + let modulo_offset = offset % MSIX_TABLE_ENTRIES_MODULO; + + if index >= self.table_entries.len() { + warn!("Invalid MSI-X table entry index {index}"); + data.fill(0xff); + return; + } + + match data.len() { + 4 => { + let value = match modulo_offset { + 0x0 => self.table_entries[index].msg_addr_lo, + 0x4 => self.table_entries[index].msg_addr_hi, + 0x8 => self.table_entries[index].msg_data, + 0xc => self.table_entries[index].vector_ctl, + off => { + warn!("msi-x: invalid offset in table entry read: {off}"); + 0xffff_ffff + } + }; + + LittleEndian::write_u32(data, value); + } + 8 => { + let value = match modulo_offset { + 0x0 => { + (u64::from(self.table_entries[index].msg_addr_hi) << 32) + | u64::from(self.table_entries[index].msg_addr_lo) + } + 0x8 => { + (u64::from(self.table_entries[index].vector_ctl) << 32) + | u64::from(self.table_entries[index].msg_data) + } + off => { + warn!("msi-x: invalid offset in table entry read: {off}"); + 0xffff_ffff_ffff_ffff + } + }; + + LittleEndian::write_u64(data, value); + } + len => { + warn!("msi-x: invalid length in table entry read: {len}"); + data.fill(0xff); + } + } + } + + pub fn write_table(&mut self, offset: u64, data: &[u8]) { + assert!(data.len() <= 8); + + let index: usize = (offset / MSIX_TABLE_ENTRIES_MODULO) as usize; + let modulo_offset = offset % MSIX_TABLE_ENTRIES_MODULO; + + if index >= self.table_entries.len() { + warn!("msi-x: invalid table entry index {index}"); + return; + } + + // Store the value of the entry before modification + let old_entry = self.table_entries[index].clone(); + + match data.len() { + 4 => { + let value = LittleEndian::read_u32(data); + match modulo_offset { + 0x0 => self.table_entries[index].msg_addr_lo = value, + 0x4 => self.table_entries[index].msg_addr_hi = value, + 0x8 => self.table_entries[index].msg_data = value, + 0xc => { + self.table_entries[index].vector_ctl = value; + } + off => warn!("msi-x: invalid offset in table entry write: {off}"), + }; + } + 8 => { + let value = LittleEndian::read_u64(data); + match modulo_offset { + 0x0 => { + self.table_entries[index].msg_addr_lo = (value & 0xffff_ffffu64) as u32; + self.table_entries[index].msg_addr_hi = (value >> 32) as u32; + } + 0x8 => { + self.table_entries[index].msg_data = (value & 0xffff_ffffu64) as u32; + self.table_entries[index].vector_ctl = (value >> 32) as u32; + } + off => warn!("msi-x: invalid offset in table entry write: {off}"), + }; + } + len => warn!("msi-x: invalid length in table entry write: {len}"), + }; + + let table_entry = &self.table_entries[index]; + + // Optimisation to avoid excessive 
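`read_table`/`write_table` above decode a raw guest offset into an entry index plus a field within the 16-byte table entry. An illustrative standalone mapping of that arithmetic (the helper name `locate` is made up):

```rust
// Maps an MSI-X table offset to the (entry index, field) pair the accessors decode.
fn locate(offset: u64) -> (usize, &'static str) {
    let index = (offset / 16) as usize; // MSIX_TABLE_ENTRIES_MODULO
    let field = match offset % 16 {
        0x0 => "msg_addr_lo",
        0x4 => "msg_addr_hi",
        0x8 => "msg_data",
        0xc => "vector_ctl",
        _ => "unaligned",
    };
    (index, field)
}

fn main() {
    assert_eq!(locate(0), (0, "msg_addr_lo"));
    assert_eq!(locate(20), (1, "msg_addr_hi"));
    assert_eq!(locate(28), (1, "vector_ctl"));
}
```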
updates + if &old_entry == table_entry { + return; + } + + // Update interrupt routes + // Optimisation: only update routes if the entry is not masked; + // this is safe because if the entry is masked (starts masked as per spec) + // in the table then it won't be triggered. + if self.enabled && !self.masked && !table_entry.masked() { + let config = MsiIrqSourceConfig { + high_addr: table_entry.msg_addr_hi, + low_addr: table_entry.msg_addr_lo, + data: table_entry.msg_data, + devid: self.devid, + }; + + if let Err(e) = self.interrupt_source_group.update( + index as InterruptIndex, + InterruptSourceConfig::MsiIrq(config), + table_entry.masked(), + true, + ) { + error!("Failed updating vector: {:?}", e); + } + } + + // After the MSI-X table entry has been updated, it is necessary to + // check if the vector control masking bit has changed. In case the + // bit has been flipped from 1 to 0, we need to inject a MSI message + // if the corresponding pending bit from the PBA is set. Once the MSI + // has been injected, the pending bit in the PBA needs to be cleared. + // All of this is valid only if MSI-X has not been masked for the whole + // device. + + // Check if bit has been flipped + if !self.masked + && self.enabled + && old_entry.masked() + && !table_entry.masked() + && self.get_pba_bit(index as u16) == 1 + { + self.inject_msix_and_clear_pba(index); + } + } + + pub fn read_pba(&self, offset: u64, data: &mut [u8]) { + let index: usize = (offset / MSIX_PBA_ENTRIES_MODULO) as usize; + let modulo_offset = offset % MSIX_PBA_ENTRIES_MODULO; + + if index >= self.pba_entries.len() { + warn!("msi-x: invalid PBA entry index {index}"); + data.fill(0xff); + return; + } + + match data.len() { + 4 => { + let value: u32 = match modulo_offset { + 0x0 => (self.pba_entries[index] & 0xffff_ffffu64) as u32, + 0x4 => (self.pba_entries[index] >> 32) as u32, + off => { + warn!("msi-x: invalid offset in pba entry read: {off}"); + 0xffff_ffff + } + }; + + LittleEndian::write_u32(data, value); + } + 8 => { + let value: u64 = match modulo_offset { + 0x0 => self.pba_entries[index], + off => { + warn!("msi-x: invalid offset in pba entry read: {off}"); + 0xffff_ffff_ffff_ffff + } + }; + + LittleEndian::write_u64(data, value); + } + len => { + warn!("msi-x: invalid length in table entry read: {len}"); + data.fill(0xff); + } + } + } + + pub fn write_pba(&mut self, _offset: u64, _data: &[u8]) { + error!("Pending Bit Array is read only"); + } + + pub fn set_pba_bit(&mut self, vector: u16, reset: bool) { + assert!(vector < MAX_MSIX_VECTORS_PER_DEVICE); + + if (vector as usize) >= self.table_entries.len() { + return; + } + + let index: usize = (vector as usize) / BITS_PER_PBA_ENTRY; + let shift: usize = (vector as usize) % BITS_PER_PBA_ENTRY; + let mut mask: u64 = 1u64 << shift; + + if reset { + mask = !mask; + self.pba_entries[index] &= mask; + } else { + self.pba_entries[index] |= mask; + } + } + + fn get_pba_bit(&self, vector: u16) -> u8 { + assert!(vector < MAX_MSIX_VECTORS_PER_DEVICE); + + if (vector as usize) >= self.table_entries.len() { + return 0xff; + } + + let index: usize = (vector as usize) / BITS_PER_PBA_ENTRY; + let shift: usize = (vector as usize) % BITS_PER_PBA_ENTRY; + + ((self.pba_entries[index] >> shift) & 0x0000_0001u64) as u8 + } + + fn inject_msix_and_clear_pba(&mut self, vector: usize) { + // Inject the MSI message + match self + .interrupt_source_group + .trigger(vector as InterruptIndex) + { + Ok(_) => debug!("MSI-X injected on vector control flip"), + Err(e) => error!("failed to inject MSI-X: {}", 
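The PBA accessors above use one `u64` entry per 64 vectors. A sketch of the index/shift arithmetic, with values that line up with the `test_pba_read` expectations further down (the helper name is illustrative):

```rust
// Position of a vector's pending bit inside the PBA: (entry index, bit shift).
fn pba_position(vector: u16) -> (usize, u32) {
    ((vector as usize) / 64, (vector as u32) % 64) // BITS_PER_PBA_ENTRY = 64
}

fn main() {
    assert_eq!(pba_position(1), (0, 1)); // bit 1 of entry 0 -> entry value 0x2
    assert_eq!(pba_position(96), (1, 32)); // bit 32 of entry 1
    assert_eq!(1u64 << pba_position(96).1, 0x1_0000_0000);
}
```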
e), + } + + // Clear the bit from PBA + self.set_pba_bit(vector as u16, true); + } +} + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Clone, Copy, Default, Serialize, Deserialize)] +pub struct MsixCap { + // Message Control Register + // 10-0: MSI-X Table size + // 13-11: Reserved + // 14: Mask. Mask all MSI-X when set. + // 15: Enable. Enable all MSI-X when set. + pub msg_ctl: u16, + // Table. Contains the offset and the BAR indicator (BIR) + // 2-0: Table BAR indicator (BIR). Can be 0 to 5. + // 31-3: Table offset in the BAR pointed by the BIR. + pub table: u32, + // Pending Bit Array. Contains the offset and the BAR indicator (BIR) + // 2-0: PBA BAR indicator (BIR). Can be 0 to 5. + // 31-3: PBA offset in the BAR pointed by the BIR. + pub pba: u32, +} + +// SAFETY: All members are simple numbers and any value is valid. +unsafe impl ByteValued for MsixCap {} + +impl PciCapability for MsixCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::MsiX + } +} + +impl MsixCap { + pub fn new( + table_pci_bar: u8, + table_size: u16, + table_off: u32, + pba_pci_bar: u8, + pba_off: u32, + ) -> Self { + assert!(table_size < MAX_MSIX_VECTORS_PER_DEVICE); + + // Set the table size and enable MSI-X. + let msg_ctl: u16 = 0x8000u16 + table_size - 1; + + MsixCap { + msg_ctl, + table: (table_off & 0xffff_fff8u32) | u32::from(table_pci_bar & 0x7u8), + pba: (pba_off & 0xffff_fff8u32) | u32::from(pba_pci_bar & 0x7u8), + } + } +} + +#[cfg(test)] +mod tests { + use std::sync::atomic::{AtomicUsize, Ordering}; + + use vmm_sys_util::eventfd::EventFd; + + use super::*; + + #[derive(Debug)] + struct MockInterrupt { + trigger_cnt: [AtomicUsize; 2], + update_cnt: [AtomicUsize; 2], + event_fd: [EventFd; 2], + } + + impl MockInterrupt { + fn new() -> Self { + MockInterrupt { + trigger_cnt: [AtomicUsize::new(0), AtomicUsize::new(0)], + update_cnt: [AtomicUsize::new(0), AtomicUsize::new(0)], + event_fd: [ + EventFd::new(libc::EFD_NONBLOCK).unwrap(), + EventFd::new(libc::EFD_NONBLOCK).unwrap(), + ], + } + } + + fn interrupt_cnt(&self, index: InterruptIndex) -> usize { + self.trigger_cnt[index as usize].load(Ordering::SeqCst) + } + + fn update_cnt(&self, index: InterruptIndex) -> usize { + self.update_cnt[index as usize].load(Ordering::SeqCst) + } + } + + impl InterruptSourceGroup for MockInterrupt { + fn trigger(&self, index: InterruptIndex) -> vm_device::interrupt::Result<()> { + self.trigger_cnt[index as usize].fetch_add(1, Ordering::SeqCst); + Ok(()) + } + + fn notifier(&self, index: InterruptIndex) -> Option<&EventFd> { + self.event_fd.get(index as usize) + } + + fn update( + &self, + index: InterruptIndex, + _config: InterruptSourceConfig, + _masked: bool, + _set_gsi: bool, + ) -> vm_device::interrupt::Result<()> { + self.update_cnt[index as usize].fetch_add(1, Ordering::SeqCst); + Ok(()) + } + + fn set_gsi(&self) -> vm_device::interrupt::Result<()> { + Ok(()) + } + } + + #[test] + #[should_panic] + fn test_too_many_vectors() { + MsixConfig::new(2049, Arc::new(MockInterrupt::new()), 0x42, None).unwrap(); + } + + #[test] + fn test_new_msix_config() { + let vectors = Arc::new(MockInterrupt::new()); + let config = MsixConfig::new(2, vectors.clone(), 0x42, None).unwrap(); + assert_eq!(config.devid, 0x42); + assert!(config.masked); + assert!(!config.enabled); + assert_eq!(config.table_entries.len(), 2); + assert_eq!(config.pba_entries.len(), 1); + } + + #[test] + fn test_enable_msix_vectors() { + let vectors = Arc::new(MockInterrupt::new()); + let mut config = 
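`MsixCap::new` below packs the BAR indicator (BIR) into bits 2:0 of the `table`/`pba` words and the offset into bits 31:3. A small sketch of decoding such a word (illustrative helper, not code from this change):

```rust
// Splits an MSI-X capability "table" (or "pba") word into (BAR index, offset).
fn decode_table_word(table: u32) -> (u8, u32) {
    ((table & 0x7) as u8, table & 0xffff_fff8)
}

fn main() {
    // BAR 1, table located at offset 0x2000 inside that BAR.
    let word = 0x2000u32 | 0x1;
    assert_eq!(decode_table_word(word), (1, 0x2000));
}
```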
MsixConfig::new(2, vectors.clone(), 0x42, None).unwrap(); + + assert!(!config.enabled); + assert!(config.masked); + + // Bit 15 marks whether MSI-X is enabled + // Bit 14 marks whether vectors are masked + config.set_msg_ctl(0x8000); + assert!(config.enabled); + assert!(!config.masked); + + config.set_msg_ctl(0x4000); + assert!(!config.enabled); + assert!(config.masked); + + config.set_msg_ctl(0xC000); + assert!(config.enabled); + assert!(config.masked); + + config.set_msg_ctl(0x0); + assert!(!config.enabled); + assert!(!config.masked); + } + + #[test] + #[should_panic] + fn test_table_access_read_too_big() { + let vectors = Arc::new(MockInterrupt::new()); + let config = MsixConfig::new(2, vectors.clone(), 0x42, None).unwrap(); + let mut buffer = [0u8; 16]; + + config.read_table(0, &mut buffer); + } + + #[test] + fn test_read_table_past_end() { + let vectors = Arc::new(MockInterrupt::new()); + let config = MsixConfig::new(2, vectors.clone(), 0x42, None).unwrap(); + let mut buffer = [0u8; 8]; + + // We have 2 vectors (16 bytes each), so we should be able to read up to 32 bytes. + // Past that the device should respond with all 1s + config.read_table(32, &mut buffer); + assert_eq!(buffer, [0xff; 8]); + } + + #[test] + fn test_read_table_bad_length() { + let vectors = Arc::new(MockInterrupt::new()); + let config = MsixConfig::new(2, vectors.clone(), 0x42, None).unwrap(); + let mut buffer = [0u8; 8]; + + // We can either read 4 or 8 bytes + config.read_table(0, &mut buffer[..0]); + assert_eq!(buffer, [0x0; 8]); + config.read_table(0, &mut buffer[..1]); + assert_eq!(buffer[..1], [0xff; 1]); + config.read_table(0, &mut buffer[..2]); + assert_eq!(buffer[..2], [0xff; 2]); + config.read_table(0, &mut buffer[..3]); + assert_eq!(buffer[..3], [0xff; 3]); + config.read_table(0, &mut buffer[..5]); + assert_eq!(buffer[..5], [0xff; 5]); + config.read_table(0, &mut buffer[..6]); + assert_eq!(buffer[..6], [0xff; 6]); + config.read_table(0, &mut buffer[..7]); + assert_eq!(buffer[..7], [0xff; 7]); + config.read_table(0, &mut buffer[..4]); + assert_eq!(buffer, u64::to_le_bytes(0x00ff_ffff_0000_0000)); + config.read_table(0, &mut buffer); + assert_eq!(buffer, u64::to_le_bytes(0)); + } + + #[test] + fn test_access_table() { + let vectors = Arc::new(MockInterrupt::new()); + let mut config = MsixConfig::new(2, vectors.clone(), 0x42, None).unwrap(); + // enabled and not masked + config.set_msg_ctl(0x8000); + assert_eq!(vectors.update_cnt(0), 1); + assert_eq!(vectors.update_cnt(1), 1); + let mut buffer = [0u8; 8]; + + // Write first vector's address with a single 8-byte write + config.write_table(0, &u64::to_le_bytes(0x0000_1312_0000_1110)); + // It's still masked so shouldn't be updated + assert_eq!(vectors.update_cnt(0), 1); + assert_eq!(vectors.update_cnt(1), 1); + // Same for control and message data + config.write_table(8, &u64::to_le_bytes(0x0_0000_0020)); + // Now, we enabled it, so we should see an update + assert_eq!(vectors.update_cnt(0), 2); + assert_eq!(vectors.update_cnt(1), 1); + + // Write second vector's fields with 4-byte writes + // low 32 bits of the address + config.write_table(16, &u32::to_le_bytes(0x4241)); + assert_eq!(vectors.update_cnt(0), 2); + // Still masked + assert_eq!(vectors.update_cnt(1), 1); + // high 32 bits of the address + config.write_table(20, &u32::to_le_bytes(0x4443)); + assert_eq!(vectors.update_cnt(0), 2); + // Still masked + assert_eq!(vectors.update_cnt(1), 1); + // message data + config.write_table(24, &u32::to_le_bytes(0x21)); + assert_eq!(vectors.update_cnt(0), 2); + 
// Still masked + assert_eq!(vectors.update_cnt(1), 1); + // vector control + config.write_table(28, &u32::to_le_bytes(0x0)); + assert_eq!(vectors.update_cnt(0), 2); + assert_eq!(vectors.update_cnt(1), 2); + + assert_eq!(config.table_entries[0].msg_addr_hi, 0x1312); + assert_eq!(config.table_entries[0].msg_addr_lo, 0x1110); + assert_eq!(config.table_entries[0].msg_data, 0x20); + assert_eq!(config.table_entries[0].vector_ctl, 0); + + assert_eq!(config.table_entries[1].msg_addr_hi, 0x4443); + assert_eq!(config.table_entries[1].msg_addr_lo, 0x4241); + assert_eq!(config.table_entries[1].msg_data, 0x21); + assert_eq!(config.table_entries[1].vector_ctl, 0); + + assert_eq!(config.table_entries.len(), 2); + assert_eq!(config.pba_entries.len(), 1); + + // reading at a bad offset should return all 1s + config.read_table(1, &mut buffer[..4]); + assert_eq!(buffer[..4], [0xff; 4]); + // read low address for first vector + config.read_table(0, &mut buffer[..4]); + assert_eq!( + buffer[..4], + u32::to_le_bytes(config.table_entries[0].msg_addr_lo) + ); + // read the high address for first vector + config.read_table(4, &mut buffer[4..]); + assert_eq!(0x0000_1312_0000_1110, u64::from_le_bytes(buffer)); + // read msg_data from second vector + config.read_table(24, &mut buffer[..4]); + assert_eq!(u32::to_le_bytes(0x21), &buffer[..4]); + // read vector control for second vector + config.read_table(28, &mut buffer[..4]); + assert_eq!(u32::to_le_bytes(0x0), &buffer[..4]); + + // reading with 8 bytes at bad offset should also return all 1s + config.read_table(19, &mut buffer); + assert_eq!(buffer, [0xff; 8]); + + // Read the second vector's address using an 8 byte read + config.read_table(16, &mut buffer); + assert_eq!(0x0000_4443_0000_4241, u64::from_le_bytes(buffer)); + + // Read the first vector's ctrl and data with a single 8 byte read + config.read_table(8, &mut buffer); + assert_eq!(0x0_0000_0020, u64::from_le_bytes(buffer)); + + // If we mask the interrupts we shouldn't see any update + config.write_table(12, &u32::to_le_bytes(0x1)); + config.write_table(28, &u32::to_le_bytes(0x1)); + assert_eq!(vectors.update_cnt(0), 2); + assert_eq!(vectors.update_cnt(1), 2); + + // Un-masking them should update them + config.write_table(12, &u32::to_le_bytes(0x0)); + config.write_table(28, &u32::to_le_bytes(0x0)); + assert_eq!(vectors.update_cnt(0), 3); + assert_eq!(vectors.update_cnt(1), 3); + + // Setting up the same config should have no effect + config.write_table(12, &u32::to_le_bytes(0x0)); + config.write_table(28, &u32::to_le_bytes(0x0)); + assert_eq!(vectors.update_cnt(0), 3); + assert_eq!(vectors.update_cnt(1), 3); + } + + #[test] + #[should_panic] + fn test_table_access_write_too_big() { + let vectors = Arc::new(MockInterrupt::new()); + let mut config = MsixConfig::new(2, vectors.clone(), 0x42, None).unwrap(); + let buffer = [0u8; 16]; + + config.write_table(0, &buffer); + } + + #[test] + fn test_pba_read_too_big() { + let vectors = Arc::new(MockInterrupt::new()); + let config = MsixConfig::new(2, vectors.clone(), 0x42, None).unwrap(); + let mut buffer = [0u8; 16]; + + config.read_pba(0, &mut buffer); + assert_eq!(buffer, [0xff; 16]); + } + + #[test] + fn test_pba_invalid_offset() { + let vectors = Arc::new(MockInterrupt::new()); + let config = MsixConfig::new(2, vectors.clone(), 0x42, None).unwrap(); + let mut buffer = [0u8; 8]; + + // Past the end of the PBA array + config.read_pba(128, &mut buffer); + assert_eq!(buffer, [0xffu8; 8]); + + // Invalid offset within a valid entry + let mut buffer = [0u8; 8]; + 
config.read_pba(3, &mut buffer[..4]); + assert_eq!(buffer[..4], [0xffu8; 4]); + config.read_pba(3, &mut buffer); + assert_eq!(buffer, [0xffu8; 8]); + } + + #[test] + #[should_panic] + fn test_set_pba_bit_vector_too_big() { + let vectors = Arc::new(MockInterrupt::new()); + let mut config = MsixConfig::new(2, vectors.clone(), 0x42, None).unwrap(); + + config.set_pba_bit(2048, false); + } + + #[test] + #[should_panic] + fn test_get_pba_bit_vector_too_big() { + let vectors = Arc::new(MockInterrupt::new()); + let config = MsixConfig::new(2, vectors.clone(), 0x42, None).unwrap(); + + config.get_pba_bit(2048); + } + + #[test] + fn test_pba_bit_invalid_vector() { + let vectors = Arc::new(MockInterrupt::new()); + let mut config = MsixConfig::new(2, vectors.clone(), 0x42, None).unwrap(); + + // We have two vectors, so setting the pending bit for the third one + // should be ignored + config.set_pba_bit(2, false); + assert_eq!(config.pba_entries[0], 0); + + // Same for getting the bit + assert_eq!(config.get_pba_bit(2), 0xff); + } + + #[test] + fn test_pba_read() { + let vectors = Arc::new(MockInterrupt::new()); + let mut config = MsixConfig::new(128, vectors.clone(), 0x42, None).unwrap(); + let mut buffer = [0u8; 8]; + + config.set_pba_bit(1, false); + assert_eq!(config.pba_entries[0], 2); + assert_eq!(config.pba_entries[1], 0); + config.read_pba(0, &mut buffer); + assert_eq!(0x2, u64::from_le_bytes(buffer)); + + let mut buffer = [0u8; 4]; + config.set_pba_bit(96, false); + assert_eq!(config.pba_entries[0], 2); + assert_eq!(config.pba_entries[1], 0x1_0000_0000); + config.read_pba(8, &mut buffer); + assert_eq!(0x0, u32::from_le_bytes(buffer)); + config.read_pba(12, &mut buffer); + assert_eq!(0x1, u32::from_le_bytes(buffer)); + } + + #[test] + fn test_pending_interrupt() { + let vectors = Arc::new(MockInterrupt::new()); + let mut config = MsixConfig::new(2, vectors.clone(), 0x42, None).unwrap(); + config.set_pba_bit(1, false); + assert_eq!(config.get_pba_bit(1), 1); + // Enable MSI-X vector and unmask interrupts + config.set_msg_ctl(0x8000); + + // Individual vectors are still masked, so no change + assert_eq!(vectors.interrupt_cnt(0), 0); + assert_eq!(vectors.interrupt_cnt(1), 0); + + // Enable all vectors + config.write_table(8, &u64::to_le_bytes(0x0_0000_0020)); + config.write_table(24, &u64::to_le_bytes(0x0_0000_0020)); + + // Vector one had a pending bit, so we must have triggered an interrupt for it + // and cleared the pending bit + assert_eq!(vectors.interrupt_cnt(0), 0); + assert_eq!(vectors.interrupt_cnt(1), 1); + assert_eq!(config.get_pba_bit(1), 0); + + // Check that interrupt is sent as well for enabled vectors once we unmask from + // Message Control + + // Mask vectors and set pending bit for vector 0 + config.set_msg_ctl(0xc000); + config.set_pba_bit(0, false); + assert_eq!(vectors.interrupt_cnt(0), 0); + assert_eq!(vectors.interrupt_cnt(1), 1); + + // Unmask them + config.set_msg_ctl(0x8000); + assert_eq!(vectors.interrupt_cnt(0), 1); + assert_eq!(vectors.interrupt_cnt(1), 1); + assert_eq!(config.get_pba_bit(0), 0); + } +} diff --git a/src/vm-device/Cargo.toml b/src/vm-device/Cargo.toml new file mode 100644 index 00000000000..b6471ab9f6a --- /dev/null +++ b/src/vm-device/Cargo.toml @@ -0,0 +1,16 @@ +[package] +authors = ["The Cloud Hypervisor Authors"] +edition = "2021" +name = "vm-device" +version = "0.1.0" +license = "Apache-2.0 AND BSD-3-Clause" + +[lib] +bench = false + +[features] +default = [] + +[dependencies] +serde = { version = "1.0.208", features = ["derive", "rc"] } 
+vmm-sys-util = { version = "0.14.0", features = ["with-serde"] } diff --git a/src/vm-device/src/bus.rs b/src/vm-device/src/bus.rs new file mode 100644 index 00000000000..31880d354bb --- /dev/null +++ b/src/vm-device/src/bus.rs @@ -0,0 +1,407 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. + +//! Handles routing to devices in an address space. + +use std::cmp::Ordering; +use std::collections::btree_map::BTreeMap; +use std::sync::{Arc, Barrier, Mutex, RwLock, Weak}; +use std::{convert, error, fmt, io, result}; + +/// Trait for devices that respond to reads or writes in an arbitrary address space. +/// +/// The device does not care where it exists in address space as each method is only given an offset +/// into its allocated portion of address space. +#[allow(unused_variables)] +pub trait BusDevice: Send { + /// Reads at `offset` from this device + fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {} + /// Writes at `offset` into this device + fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option> { + None + } +} + +#[allow(unused_variables)] +pub trait BusDeviceSync: Send + Sync { + /// Reads at `offset` from this device + fn read(&self, base: u64, offset: u64, data: &mut [u8]) {} + /// Writes at `offset` into this device + fn write(&self, base: u64, offset: u64, data: &[u8]) -> Option> { + None + } +} + +impl BusDeviceSync for Mutex { + /// Reads at `offset` from this device + fn read(&self, base: u64, offset: u64, data: &mut [u8]) { + self.lock() + .expect("Failed to acquire device lock") + .read(base, offset, data) + } + /// Writes at `offset` into this device + fn write(&self, base: u64, offset: u64, data: &[u8]) -> Option> { + self.lock() + .expect("Failed to acquire device lock") + .write(base, offset, data) + } +} + +#[derive(Debug)] +pub enum Error { + /// The insertion failed because the new device overlapped with an old device. + Overlap, + /// Failed to operate on zero sized range. + ZeroSizedRange, + /// Failed to find address range. + MissingAddressRange, +} + +pub type Result = result::Result; + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "bus_error: {self:?}") + } +} + +impl error::Error for Error {} + +impl convert::From for io::Error { + fn from(e: Error) -> Self { + io::Error::other(e) + } +} + +/// Holds a base and length representing the address space occupied by a `BusDevice`. +/// +/// * base - The address at which the range start. +/// * len - The length of the range in bytes. +#[derive(Debug, Copy, Clone)] +pub struct BusRange { + pub base: u64, + pub len: u64, +} + +impl BusRange { + /// Returns true if there is overlap with the given range. + pub fn overlaps(&self, base: u64, len: u64) -> bool { + self.base < (base + len) && base < self.base + self.len + } +} + +impl Eq for BusRange {} + +impl PartialEq for BusRange { + fn eq(&self, other: &BusRange) -> bool { + self.base == other.base + } +} + +impl Ord for BusRange { + fn cmp(&self, other: &BusRange) -> Ordering { + self.base.cmp(&other.base) + } +} + +impl PartialOrd for BusRange { + fn partial_cmp(&self, other: &BusRange) -> Option { + Some(self.cmp(other)) + } +} + +/// A device container for routing reads and writes over some address space. 
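The blanket impl above lets a plain `BusDevice` behind a `Mutex` be registered on the `Bus`. A minimal usage sketch, assuming the `vm-device` crate added here is available as a dependency; `Scratch` is a made-up device used only for illustration.

```rust
use std::sync::{Arc, Mutex};

use vm_device::{Bus, BusDevice};

// A toy device exposing four bytes of scratch data.
struct Scratch([u8; 4]);

impl BusDevice for Scratch {
    fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
        for (i, b) in data.iter_mut().enumerate() {
            *b = self.0[(offset as usize + i) % 4];
        }
    }
}

fn main() {
    let bus = Bus::new();
    // Mutex<Scratch> satisfies BusDeviceSync through the blanket impl.
    let dev = Arc::new(Mutex::new(Scratch([1, 2, 3, 4])));
    bus.insert(dev.clone(), 0x1000, 0x10).unwrap();

    let mut buf = [0u8; 2];
    bus.read(0x1002, &mut buf).unwrap();
    assert_eq!(buf, [3, 4]);
}
```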
+/// +/// This doesn't have any restrictions on what kind of device or address space this applies to. The +/// only restriction is that no two devices can overlap in this address space. +#[derive(Default, Debug)] +pub struct Bus { + devices: RwLock>>, +} + +impl Bus { + /// Constructs an a bus with an empty address space. + pub fn new() -> Bus { + Bus { + devices: RwLock::new(BTreeMap::new()), + } + } + + fn first_before(&self, addr: u64) -> Option<(BusRange, Arc)> { + let devices = self.devices.read().unwrap(); + let (range, dev) = devices + .range(..=BusRange { base: addr, len: 1 }) + .next_back()?; + dev.upgrade().map(|d| (*range, d.clone())) + } + + #[allow(clippy::type_complexity)] + pub fn resolve(&self, addr: u64) -> Option<(u64, u64, Arc)> { + if let Some((range, dev)) = self.first_before(addr) { + let offset = addr - range.base; + if offset < range.len { + return Some((range.base, offset, dev)); + } + } + None + } + + pub fn insert(&self, device: Arc, base: u64, len: u64) -> Result<()> { + if len == 0 { + return Err(Error::ZeroSizedRange); + } + + // Reject all cases where the new device's range overlaps with an existing device. + if self + .devices + .read() + .unwrap() + .iter() + .any(|(range, _dev)| range.overlaps(base, len)) + { + return Err(Error::Overlap); + } + + if self + .devices + .write() + .unwrap() + .insert(BusRange { base, len }, Arc::downgrade(&device)) + .is_some() + { + return Err(Error::Overlap); + } + + Ok(()) + } + + /// Removes the device at the given address space range. + pub fn remove(&self, base: u64, len: u64) -> Result<()> { + if len == 0 { + return Err(Error::ZeroSizedRange); + } + + let bus_range = BusRange { base, len }; + + if self.devices.write().unwrap().remove(&bus_range).is_none() { + return Err(Error::MissingAddressRange); + } + + Ok(()) + } + + /// Removes all entries referencing the given device. + pub fn remove_by_device(&self, device: &Arc) -> Result<()> { + let mut device_list = self.devices.write().unwrap(); + let mut remove_key_list = Vec::new(); + + for (key, value) in device_list.iter() { + if Arc::ptr_eq(&value.upgrade().unwrap(), device) { + remove_key_list.push(*key); + } + } + + for key in remove_key_list.iter() { + device_list.remove(key); + } + + Ok(()) + } + + /// Updates the address range for an existing device. + pub fn update_range( + &self, + old_base: u64, + old_len: u64, + new_base: u64, + new_len: u64, + ) -> Result<()> { + // Retrieve the device corresponding to the range + let device = if let Some((_, _, dev)) = self.resolve(old_base) { + dev.clone() + } else { + return Err(Error::MissingAddressRange); + }; + + // Remove the old address range + self.remove(old_base, old_len)?; + + // Insert the new address range + self.insert(device, new_base, new_len) + } + + /// Reads data from the device that owns the range containing `addr` and puts it into `data`. + /// + /// Returns true on success, otherwise `data` is untouched. + pub fn read(&self, addr: u64, data: &mut [u8]) -> Result<()> { + if let Some((base, offset, dev)) = self.resolve(addr) { + // OK to unwrap as lock() failing is a serious error condition and should panic. + dev.read(base, offset, data); + Ok(()) + } else { + Err(Error::MissingAddressRange) + } + } + + /// Writes `data` to the device that owns the range containing `addr`. + /// + /// Returns true on success, otherwise `data` is untouched. 
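One design note worth calling out about the `Bus` container above: it stores `Weak` references, so registering a device does not keep it alive. A sketch of the resulting behaviour, again assuming the `vm-device` crate from this change; `ExampleDevice` is hypothetical.

```rust
use std::sync::Arc;

use vm_device::{Bus, BusDeviceSync};

struct ExampleDevice;
impl BusDeviceSync for ExampleDevice {} // default no-op read/write

fn main() {
    let bus = Bus::new();
    let dev = Arc::new(ExampleDevice);
    bus.insert(dev.clone(), 0x0, 0x100).unwrap();

    let mut data = [0u8; 4];
    bus.read(0x10, &mut data).unwrap();

    drop(dev);
    // The range is still registered, but the Weak pointer no longer upgrades,
    // so accesses fail to resolve.
    bus.read(0x10, &mut data).unwrap_err();
}
```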
+ pub fn write(&self, addr: u64, data: &[u8]) -> Result>> { + if let Some((base, offset, dev)) = self.resolve(addr) { + // OK to unwrap as lock() failing is a serious error condition and should panic. + Ok(dev.write(base, offset, data)) + } else { + Err(Error::MissingAddressRange) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + struct DummyDevice; + impl BusDeviceSync for DummyDevice {} + + struct ConstantDevice; + impl BusDeviceSync for ConstantDevice { + fn read(&self, _base: u64, offset: u64, data: &mut [u8]) { + for (i, v) in data.iter_mut().enumerate() { + *v = (offset as u8) + (i as u8); + } + } + + fn write(&self, _base: u64, offset: u64, data: &[u8]) -> Option> { + for (i, v) in data.iter().enumerate() { + assert_eq!(*v, (offset as u8) + (i as u8)) + } + + None + } + } + + #[test] + fn bus_insert() { + let bus = Bus::new(); + let dummy = Arc::new(DummyDevice); + bus.insert(dummy.clone(), 0x10, 0).unwrap_err(); + bus.insert(dummy.clone(), 0x10, 0x10).unwrap(); + + let result = bus.insert(dummy.clone(), 0x0f, 0x10); + assert_eq!(format!("{result:?}"), "Err(Overlap)"); + + bus.insert(dummy.clone(), 0x10, 0x10).unwrap_err(); + bus.insert(dummy.clone(), 0x10, 0x15).unwrap_err(); + bus.insert(dummy.clone(), 0x12, 0x15).unwrap_err(); + bus.insert(dummy.clone(), 0x12, 0x01).unwrap_err(); + bus.insert(dummy.clone(), 0x0, 0x20).unwrap_err(); + bus.insert(dummy.clone(), 0x20, 0x05).unwrap(); + bus.insert(dummy.clone(), 0x25, 0x05).unwrap(); + bus.insert(dummy, 0x0, 0x10).unwrap(); + } + + #[test] + fn bus_remove() { + let bus = Bus::new(); + let dummy: Arc = Arc::new(DummyDevice); + + bus.remove(0x42, 0x0).unwrap_err(); + + bus.remove(0x13, 0x12).unwrap_err(); + + bus.insert(dummy.clone(), 0x13, 0x12).unwrap(); + bus.remove(0x42, 0x42).unwrap_err(); + bus.remove(0x13, 0x12).unwrap(); + + bus.insert(dummy.clone(), 0x16, 0x1).unwrap(); + bus.remove_by_device(&dummy).unwrap(); + bus.remove(0x16, 0x1).unwrap_err(); + } + + #[test] + #[allow(clippy::redundant_clone)] + fn bus_read_write() { + let bus = Bus::new(); + let dummy = Arc::new(DummyDevice); + bus.insert(dummy.clone(), 0x10, 0x10).unwrap(); + bus.read(0x10, &mut [0, 0, 0, 0]).unwrap(); + bus.write(0x10, &[0, 0, 0, 0]).unwrap(); + bus.read(0x11, &mut [0, 0, 0, 0]).unwrap(); + bus.write(0x11, &[0, 0, 0, 0]).unwrap(); + bus.read(0x16, &mut [0, 0, 0, 0]).unwrap(); + bus.write(0x16, &[0, 0, 0, 0]).unwrap(); + bus.read(0x20, &mut [0, 0, 0, 0]).unwrap_err(); + bus.write(0x20, &[0, 0, 0, 0]).unwrap_err(); + bus.read(0x06, &mut [0, 0, 0, 0]).unwrap_err(); + bus.write(0x06, &[0, 0, 0, 0]).unwrap_err(); + } + + #[test] + #[allow(clippy::redundant_clone)] + fn bus_read_write_values() { + let bus = Bus::new(); + let dummy = Arc::new(ConstantDevice); + bus.insert(dummy.clone(), 0x10, 0x10).unwrap(); + + let mut values = [0, 1, 2, 3]; + bus.read(0x10, &mut values).unwrap(); + assert_eq!(values, [0, 1, 2, 3]); + bus.write(0x10, &values).unwrap(); + bus.read(0x15, &mut values).unwrap(); + assert_eq!(values, [5, 6, 7, 8]); + bus.write(0x15, &values).unwrap(); + } + + #[test] + #[allow(clippy::redundant_clone)] + fn busrange_cmp() { + let range = BusRange { base: 0x10, len: 2 }; + assert_eq!(range, BusRange { base: 0x10, len: 3 }); + assert_eq!(range, BusRange { base: 0x10, len: 2 }); + + assert!(range < BusRange { base: 0x12, len: 1 }); + assert!(range < BusRange { base: 0x12, len: 3 }); + + assert_eq!(range, range.clone()); + + let bus = Bus::new(); + let mut data = [1, 2, 3, 4]; + let device = Arc::new(DummyDevice); + 
bus.insert(device.clone(), 0x10, 0x10).unwrap(); + bus.write(0x10, &data).unwrap(); + bus.read(0x10, &mut data).unwrap(); + assert_eq!(data, [1, 2, 3, 4]); + } + + #[test] + fn bus_range_overlap() { + let a = BusRange { + base: 0x1000, + len: 0x400, + }; + assert!(a.overlaps(0x1000, 0x400)); + assert!(a.overlaps(0xf00, 0x400)); + assert!(a.overlaps(0x1000, 0x01)); + assert!(a.overlaps(0xfff, 0x02)); + assert!(a.overlaps(0x1100, 0x100)); + assert!(a.overlaps(0x13ff, 0x100)); + assert!(!a.overlaps(0x1400, 0x100)); + assert!(!a.overlaps(0xf00, 0x100)); + } + + #[test] + fn bus_update_range() { + let bus = Bus::new(); + let dummy = Arc::new(DummyDevice); + + bus.update_range(0x13, 0x12, 0x16, 0x1).unwrap_err(); + bus.insert(dummy.clone(), 0x13, 12).unwrap(); + + bus.update_range(0x16, 0x1, 0x13, 0x12).unwrap_err(); + bus.update_range(0x13, 0x12, 0x16, 0x1).unwrap(); + } +} diff --git a/src/vm-device/src/interrupt/mod.rs b/src/vm-device/src/interrupt/mod.rs new file mode 100644 index 00000000000..da5d87a4e1a --- /dev/null +++ b/src/vm-device/src/interrupt/mod.rs @@ -0,0 +1,194 @@ +// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright (C) 2019 Alibaba Cloud. All rights reserved. +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause + +//! Traits and Structs to manage interrupt sources for devices. +//! +//! In system programming, an interrupt is a signal to the processor emitted by hardware or +//! software indicating an event that needs immediate attention. An interrupt alerts the processor +//! to a high-priority condition requiring the interruption of the current code the processor is +//! executing. The processor responds by suspending its current activities, saving its state, and +//! executing a function called an interrupt handler (or an interrupt service routine, ISR) to deal +//! with the event. This interruption is temporary, and, after the interrupt handler finishes, +//! unless handling the interrupt has emitted a fatal error, the processor resumes normal +//! activities. +//! +//! Hardware interrupts are used by devices to communicate that they require attention from the +//! operating system, or a bare-metal program running on the CPU if there are no OSes. The act of +//! initiating a hardware interrupt is referred to as an interrupt request (IRQ). Different devices +//! are usually associated with different interrupts using a unique value associated with each +//! interrupt. This makes it possible to know which hardware device caused which interrupts. +//! These interrupt values are often called IRQ lines, or just interrupt lines. +//! +//! Nowadays, IRQ lines is not the only mechanism to deliver device interrupts to processors. +//! MSI [(Message Signaled Interrupt)](https://en.wikipedia.org/wiki/Message_Signaled_Interrupts) +//! is another commonly used alternative in-band method of signaling an interrupt, using special +//! in-band messages to replace traditional out-of-band assertion of dedicated interrupt lines. +//! While more complex to implement in a device, message signaled interrupts have some significant +//! advantages over pin-based out-of-band interrupt signaling. Message signaled interrupts are +//! supported in PCI bus since its version 2.2, and in later available PCI Express bus. Some +//! non-PCI architectures also use message signaled interrupts. +//! +//! While IRQ is a term commonly used by Operating Systems when dealing with hardware +//! 
interrupts, the IRQ numbers managed by OSes are independent of the ones managed by VMM. +//! For simplicity sake, the term `Interrupt Source` is used instead of IRQ to represent both +//! pin-based interrupts and MSI interrupts. +//! +//! A device may support multiple types of interrupts, and each type of interrupt may support one +//! or multiple interrupt sources. For example, a PCI device may support: +//! * Legacy Irq: exactly one interrupt source. +//! * PCI MSI Irq: 1,2,4,8,16,32 interrupt sources. +//! * PCI MSIx Irq: 2^n(n=0-11) interrupt sources. +//! +//! A distinct Interrupt Source Identifier (ISID) will be assigned to each interrupt source. +//! An ID allocator will be used to allocate and free Interrupt Source Identifiers for devices. +//! To decouple the vm-device crate from the ID allocator, the vm-device crate doesn't take the +//! responsibility to allocate/free Interrupt Source IDs but only makes use of assigned IDs. +//! +//! The overall flow to deal with interrupts is: +//! * The VMM creates an interrupt manager +//! * The VMM creates a device manager, passing on an reference to the interrupt manager +//! * The device manager passes on an reference to the interrupt manager to all registered devices +//! * The guest kernel loads drivers for virtual devices +//! * The guest device driver determines the type and number of interrupts needed, and update the +//! device configuration +//! * The virtual device backend requests the interrupt manager to create an interrupt group +//! according to guest configuration information + +use std::sync::Arc; + +use vmm_sys_util::eventfd::EventFd; + +/// Reuse std::io::Result to simplify interoperability among crates. +pub type Result = std::io::Result; + +/// Data type to store an interrupt source identifier. +pub type InterruptIndex = u32; + +/// Configuration data for legacy interrupts. +/// +/// On x86 platforms, legacy interrupts means those interrupts routed through PICs or IOAPICs. +#[derive(Copy, Clone, Debug)] +pub struct LegacyIrqSourceConfig { + pub irqchip: u32, + pub pin: u32, +} + +/// Configuration data for MSI/MSI-X interrupts. +/// +/// On x86 platforms, these interrupts are vectors delivered directly to the LAPIC. +#[derive(Copy, Clone, Debug, Default)] +pub struct MsiIrqSourceConfig { + /// High address to delivery message signaled interrupt. + pub high_addr: u32, + /// Low address to delivery message signaled interrupt. + pub low_addr: u32, + /// Data to write to delivery message signaled interrupt. + pub data: u32, + /// Unique ID of the device to delivery message signaled interrupt. + pub devid: u32, +} + +/// Configuration data for an interrupt source. +#[derive(Copy, Clone, Debug)] +pub enum InterruptSourceConfig { + /// Configuration data for Legacy interrupts. + LegacyIrq(LegacyIrqSourceConfig), + /// Configuration data for PciMsi, PciMsix and generic MSI interrupts. + MsiIrq(MsiIrqSourceConfig), +} + +/// Configuration data for legacy, pin based interrupt groups. +/// +/// A legacy interrupt group only takes one irq number as its configuration. +#[derive(Copy, Clone, Debug)] +pub struct LegacyIrqGroupConfig { + /// Legacy irq number. + pub irq: InterruptIndex, +} + +/// Configuration data for MSI/MSI-X interrupt groups +/// +/// MSI/MSI-X interrupt groups are basically a set of vectors. +#[derive(Copy, Clone, Debug)] +pub struct MsiIrqGroupConfig { + /// First index of the MSI/MSI-X interrupt vectors + pub base: InterruptIndex, + /// Number of vectors in the MSI/MSI-X group. 
+ pub count: InterruptIndex, +} + +/// Trait to manage interrupt sources for virtual device backends. +/// +/// The InterruptManager implementations should protect itself from concurrent accesses internally, +/// so it could be invoked from multi-threaded context. +pub trait InterruptManager: Send + Sync { + type GroupConfig; + + /// Create an [InterruptSourceGroup](trait.InterruptSourceGroup.html) object to manage + /// interrupt sources for a virtual device + /// + /// An [InterruptSourceGroup](trait.InterruptSourceGroup.html) object manages all interrupt + /// sources of the same type for a virtual device. + /// + /// # Arguments + /// * interrupt_type: type of interrupt source. + /// * base: base Interrupt Source ID to be managed by the group object. + /// * count: number of Interrupt Sources to be managed by the group object. + fn create_group(&self, config: Self::GroupConfig) -> Result>; + + /// Destroy an [InterruptSourceGroup](trait.InterruptSourceGroup.html) object created by + /// [create_group()](trait.InterruptManager.html#tymethod.create_group). + /// + /// Assume the caller takes the responsibility to disable all interrupt sources of the group + /// before calling destroy_group(). This assumption helps to simplify InterruptSourceGroup + /// implementations. + fn destroy_group(&self, group: Arc) -> Result<()>; +} + +pub trait InterruptSourceGroup: Send + Sync { + /// Enable the interrupt sources in the group to generate interrupts. + fn enable(&self) -> Result<()> { + // Not all interrupt sources can be enabled. + // To accommodate this, we can have a no-op here. + Ok(()) + } + + /// Disable the interrupt sources in the group to generate interrupts. + fn disable(&self) -> Result<()> { + // Not all interrupt sources can be disabled. + // To accommodate this, we can have a no-op here. + Ok(()) + } + + /// Inject an interrupt from this interrupt source into the guest. + fn trigger(&self, index: InterruptIndex) -> Result<()>; + + /// Returns an interrupt notifier from this interrupt. + /// + /// An interrupt notifier allows for external components and processes + /// to inject interrupts into a guest, by writing to the file returned + /// by this method. + #[allow(unused_variables)] + fn notifier(&self, index: InterruptIndex) -> Option<&EventFd>; + + /// Update the interrupt source group configuration. + /// + /// # Arguments + /// * index: sub-index into the group. + /// * config: configuration data for the interrupt source. + /// * masked: if the interrupt is masked + /// * set_gsi: whether update the GSI routing table. + fn update( + &self, + index: InterruptIndex, + config: InterruptSourceConfig, + masked: bool, + set_gsi: bool, + ) -> Result<()>; + + /// Set the interrupt group GSI routing table. + fn set_gsi(&self) -> Result<()>; +} diff --git a/src/vm-device/src/lib.rs b/src/vm-device/src/lib.rs new file mode 100644 index 00000000000..b980b09c4b9 --- /dev/null +++ b/src/vm-device/src/lib.rs @@ -0,0 +1,62 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +use serde::{Deserialize, Serialize}; + +mod bus; +pub mod interrupt; + +pub use self::bus::{Bus, BusDevice, BusDeviceSync, Error as BusError}; + +/// Type of Message Signalled Interrupt +#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub enum MsiIrqType { + /// PCI MSI IRQ numbers. + PciMsi, + /// PCI MSIx IRQ numbers. 
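A sketch of the flow the module documentation above describes, written only against the traits (no concrete interrupt manager or VMM types are assumed, and the message address/data values are illustrative). It assumes `create_group` hands back a shared handle to an `InterruptSourceGroup`, as the trait definitions above suggest.

```rust
use vm_device::interrupt::{
    InterruptManager, InterruptSourceConfig, InterruptSourceGroup, MsiIrqGroupConfig,
    MsiIrqSourceConfig,
};

fn setup_one_msi_vector<M>(manager: &M) -> std::io::Result<()>
where
    M: InterruptManager<GroupConfig = MsiIrqGroupConfig>,
{
    // Ask the manager for a group of two MSI vectors starting at source 0.
    let group = manager.create_group(MsiIrqGroupConfig { base: 0, count: 2 })?;

    // Program vector 0 with an (illustrative) message address/data pair.
    let config = MsiIrqSourceConfig {
        high_addr: 0,
        low_addr: 0xfee0_0000,
        data: 0x20,
        devid: 0,
    };
    group.update(0, InterruptSourceConfig::MsiIrq(config), false, true)?;
    group.enable()?;

    // Inject the interrupt into the guest.
    group.trigger(0)
}
```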
+ PciMsix, + /// Generic MSI IRQ numbers. + GenericMsi, +} + +#[derive(Copy, Clone, PartialEq, Eq, Serialize, Deserialize, Debug)] +pub enum PciBarType { + Io, + Mmio32, + Mmio64, +} + +/// Enumeration for device resources. +#[allow(missing_docs)] +#[derive(Clone, Debug, Serialize, Deserialize)] +pub enum Resource { + /// IO Port address range. + PioAddressRange { base: u16, size: u16 }, + /// Memory Mapped IO address range. + MmioAddressRange { base: u64, size: u64 }, + /// PCI BAR + PciBar { + index: usize, + base: u64, + size: u64, + type_: PciBarType, + prefetchable: bool, + }, + /// Legacy IRQ number. + LegacyIrq(u32), + /// Message Signaled Interrupt + MsiIrq { + ty: MsiIrqType, + base: u32, + size: u32, + }, + /// Network Interface Card MAC address. + MacAddress(String), + /// KVM memslot index. + KvmMemSlot(u32), +} diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index 98d95a615c2..d6a112a268a 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -14,15 +14,21 @@ tracing = ["log-instrument"] gdb = ["arrayvec", "gdbstub", "gdbstub_arch"] [dependencies] + acpi_tables = { path = "../acpi-tables" } aes-gcm = { version = "0.10.1", default-features = false, features = ["aes"] } +anyhow = "1.0.98" arrayvec = { version = "0.7.6", optional = true } aws-lc-rs = { version = "1.13.3", features = ["bindgen"] } base64 = "0.22.1" bincode = { version = "2.0.1", features = ["serde"] } bitflags = "2.9.1" +byteorder = "1.5.0" crc64 = "2.0.0" -derive_more = { version = "2.0.1", default-features = false, features = ["from", "display"] } +derive_more = { version = "2.0.1", default-features = false, features = [ + "from", + "display", +] } displaydoc = "0.2.5" event-manager = "0.4.1" gdbstub = { version = "0.7.6", optional = true } @@ -35,6 +41,7 @@ log = { version = "0.4.27", features = ["std", "serde"] } log-instrument = { path = "../log-instrument", optional = true } memfd = "0.6.3" micro_http = { git = "https://github.com/firecracker-microvm/micro-http" } +pci = { path = "../pci" } semver = { version = "1.0.26", features = ["serde"] } serde = { version = "1.0.219", features = ["derive", "rc"] } serde_json = "1.0.142" @@ -43,9 +50,14 @@ thiserror = "2.0.12" timerfd = "1.5.0" userfaultfd = "0.8.1" utils = { path = "../utils" } +uuid = "1.16.0" vhost = { version = "0.14.0", features = ["vhost-user-frontend"] } -vm-allocator = "0.1.3" -vm-memory = { version = "0.16.2", features = ["backend-mmap", "backend-bitmap"] } +vm-allocator = { version = "0.1.3", features = ["serde"] } +vm-device = { path = "../vm-device" } +vm-memory = { version = "0.16.2", features = [ + "backend-mmap", + "backend-bitmap", +] } vm-superio = "0.8.0" vmm-sys-util = { version = "0.14.0", features = ["with-serde"] } zerocopy = { version = "0.8.26" } diff --git a/src/vmm/src/acpi/mod.rs b/src/vmm/src/acpi/mod.rs index 0b5c5edcbde..f3b4164745a 100644 --- a/src/vmm/src/acpi/mod.rs +++ b/src/vmm/src/acpi/mod.rs @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use acpi_tables::fadt::{FADT_F_HW_REDUCED_ACPI, FADT_F_PWR_BUTTON, FADT_F_SLP_BUTTON}; -use acpi_tables::{Aml, Dsdt, Fadt, Madt, Rsdp, Sdt, Xsdt, aml}; +use acpi_tables::{Aml, Dsdt, Fadt, Madt, Mcfg, Rsdp, Sdt, Xsdt, aml}; use log::{debug, error}; use vm_allocator::AllocPolicy; @@ -10,10 +10,10 @@ use crate::Vcpu; use crate::acpi::x86_64::{ apic_addr, rsdp_addr, setup_arch_dsdt, setup_arch_fadt, setup_interrupt_controllers, }; -use crate::device_manager::acpi::ACPIDeviceManager; -use crate::device_manager::mmio::MMIODeviceManager; -use 
crate::device_manager::resources::ResourceAllocator; +use crate::arch::x86_64::layout; +use crate::device_manager::DeviceManager; use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; +use crate::vstate::resources::ResourceAllocator; mod x86_64; @@ -45,7 +45,6 @@ pub enum AcpiError { /// allocator for allocating space for the tables struct AcpiTableWriter<'a> { mem: &'a GuestMemoryMmap, - resource_allocator: &'a mut ResourceAllocator, } impl AcpiTableWriter<'_> { @@ -53,11 +52,15 @@ impl AcpiTableWriter<'_> { /// /// This will allocate enough space inside guest memory and write the table in the allocated /// buffer. It returns the address in which it wrote the table. - fn write_acpi_table(&mut self, table: &mut S) -> Result + fn write_acpi_table( + &mut self, + resource_allocator: &mut ResourceAllocator, + table: &mut S, + ) -> Result where S: Sdt, { - let addr = self.resource_allocator.allocate_system_memory( + let addr = resource_allocator.allocate_system_memory( table.len().try_into().unwrap(), 1, AllocPolicy::FirstMatch, @@ -79,28 +82,38 @@ impl AcpiTableWriter<'_> { /// Build the DSDT table for the guest fn build_dsdt( &mut self, - mmio_device_manager: &MMIODeviceManager, - acpi_device_manager: &ACPIDeviceManager, + device_manager: &mut DeviceManager, + resource_allocator: &mut ResourceAllocator, ) -> Result { let mut dsdt_data = Vec::new(); // Virtio-devices DSDT data - dsdt_data.extend_from_slice(&mmio_device_manager.dsdt_data); + dsdt_data.extend_from_slice(&device_manager.mmio_devices.dsdt_data); // Add GED and VMGenID AML data. - acpi_device_manager.append_aml_bytes(&mut dsdt_data)?; + device_manager + .acpi_devices + .append_aml_bytes(&mut dsdt_data)?; + + if let Some(pci_segment) = &device_manager.pci_devices.pci_segment { + pci_segment.append_aml_bytes(&mut dsdt_data)?; + } // Architecture specific DSDT data setup_arch_dsdt(&mut dsdt_data)?; let mut dsdt = Dsdt::new(OEM_ID, *b"FCVMDSDT", OEM_REVISION, dsdt_data); - self.write_acpi_table(&mut dsdt) + self.write_acpi_table(resource_allocator, &mut dsdt) } /// Build the FADT table for the guest /// /// This includes a pointer with the location of the DSDT in guest memory - fn build_fadt(&mut self, dsdt_addr: u64) -> Result { + fn build_fadt( + &mut self, + resource_allocator: &mut ResourceAllocator, + dsdt_addr: u64, + ) -> Result { let mut fadt = Fadt::new(OEM_ID, *b"FCVMFADT", OEM_REVISION); fadt.set_hypervisor_vendor_id(HYPERVISOR_VENDOR_ID); fadt.set_x_dsdt(dsdt_addr); @@ -108,13 +121,17 @@ impl AcpiTableWriter<'_> { (1 << FADT_F_HW_REDUCED_ACPI) | (1 << FADT_F_PWR_BUTTON) | (1 << FADT_F_SLP_BUTTON), ); setup_arch_fadt(&mut fadt); - self.write_acpi_table(&mut fadt) + self.write_acpi_table(resource_allocator, &mut fadt) } /// Build the MADT table for the guest /// /// This includes information about the interrupt controllers supported in the platform - fn build_madt(&mut self, nr_vcpus: u8) -> Result { + fn build_madt( + &mut self, + resource_allocator: &mut ResourceAllocator, + nr_vcpus: u8, + ) -> Result { let mut madt = Madt::new( OEM_ID, *b"FCVMMADT", @@ -122,20 +139,36 @@ impl AcpiTableWriter<'_> { apic_addr(), setup_interrupt_controllers(nr_vcpus), ); - self.write_acpi_table(&mut madt) + self.write_acpi_table(resource_allocator, &mut madt) } /// Build the XSDT table for the guest /// /// Currently, we pass to the guest just FADT and MADT tables. 
- fn build_xsdt(&mut self, fadt_addr: u64, madt_addr: u64) -> Result { + fn build_xsdt( + &mut self, + resource_allocator: &mut ResourceAllocator, + fadt_addr: u64, + madt_addr: u64, + mcfg_addr: u64, + ) -> Result { let mut xsdt = Xsdt::new( OEM_ID, *b"FCMVXSDT", OEM_REVISION, - vec![fadt_addr, madt_addr], + vec![fadt_addr, madt_addr, mcfg_addr], ); - self.write_acpi_table(&mut xsdt) + self.write_acpi_table(resource_allocator, &mut xsdt) + } + + /// Build the MCFG table for the guest. + fn build_mcfg( + &mut self, + resource_allocator: &mut ResourceAllocator, + pci_mmio_config_addr: u64, + ) -> Result { + let mut mcfg = Mcfg::new(OEM_ID, *b"FCMVMCFG", OEM_REVISION, pci_mmio_config_addr); + self.write_acpi_table(resource_allocator, &mut mcfg) } /// Build the RSDP pointer for the guest. @@ -163,20 +196,17 @@ impl AcpiTableWriter<'_> { /// such as interrupt controllers, vCPUs and VirtIO devices. pub(crate) fn create_acpi_tables( mem: &GuestMemoryMmap, + device_manager: &mut DeviceManager, resource_allocator: &mut ResourceAllocator, - mmio_device_manager: &MMIODeviceManager, - acpi_device_manager: &ACPIDeviceManager, vcpus: &[Vcpu], ) -> Result<(), AcpiError> { - let mut writer = AcpiTableWriter { - mem, - resource_allocator, - }; - - let dsdt_addr = writer.build_dsdt(mmio_device_manager, acpi_device_manager)?; - let fadt_addr = writer.build_fadt(dsdt_addr)?; - let madt_addr = writer.build_madt(vcpus.len().try_into().unwrap())?; - let xsdt_addr = writer.build_xsdt(fadt_addr, madt_addr)?; + let mut writer = AcpiTableWriter { mem }; + let dsdt_addr = writer.build_dsdt(device_manager, resource_allocator)?; + + let fadt_addr = writer.build_fadt(resource_allocator, dsdt_addr)?; + let madt_addr = writer.build_madt(resource_allocator, vcpus.len().try_into().unwrap())?; + let mcfg_addr = writer.build_mcfg(resource_allocator, layout::PCI_MMCONFIG_START)?; + let xsdt_addr = writer.build_xsdt(resource_allocator, fadt_addr, madt_addr, mcfg_addr)?; writer.build_rsdp(xsdt_addr) } @@ -188,8 +218,8 @@ mod tests { use crate::acpi::{AcpiError, AcpiTableWriter}; use crate::arch::x86_64::layout::{SYSTEM_MEM_SIZE, SYSTEM_MEM_START}; use crate::builder::tests::default_vmm; - use crate::device_manager::resources::ResourceAllocator; use crate::utils::u64_to_usize; + use crate::vstate::resources::ResourceAllocator; use crate::vstate::vm::tests::setup_vm_with_memory; struct MockSdt(Vec); @@ -215,20 +245,24 @@ mod tests { #[test] fn test_write_acpi_table_memory_allocation() { // A mocke Vmm object with 128MBs of memory - let mut vmm = default_vmm(); + let vmm = default_vmm(); let mut writer = AcpiTableWriter { mem: vmm.vm.guest_memory(), - resource_allocator: &mut vmm.resource_allocator, }; + let mut resource_allocator = vmm.vm.resource_allocator(); // This should succeed let mut sdt = MockSdt(vec![0; 4096]); - let addr = writer.write_acpi_table(&mut sdt).unwrap(); + let addr = writer + .write_acpi_table(&mut resource_allocator, &mut sdt) + .unwrap(); assert_eq!(addr, SYSTEM_MEM_START); // Let's try to write two 4K pages plus one byte let mut sdt = MockSdt(vec![0; usize::try_from(SYSTEM_MEM_SIZE + 1).unwrap()]); - let err = writer.write_acpi_table(&mut sdt).unwrap_err(); + let err = writer + .write_acpi_table(&mut resource_allocator, &mut sdt) + .unwrap_err(); assert!( matches!( err, @@ -241,19 +275,29 @@ mod tests { // We are allocating memory for tables with alignment of 1 byte. All of these should // succeed. 
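`build_mcfg` above advertises the start of the ECAM window to the guest. As a reference for reviewers, the standard ECAM offset calculation the guest uses within that window looks roughly like this (illustrative helper, not code from this change):

```rust
// Standard PCI ECAM layout: each function gets a 4 KiB config window at a
// fixed offset from the segment base advertised in the MCFG table.
fn ecam_offset(bus: u8, device: u8, function: u8, register: u16) -> u64 {
    assert!(device < 32 && function < 8 && register < 4096);
    ((bus as u64) << 20) | ((device as u64) << 15) | ((function as u64) << 12) | register as u64
}

fn main() {
    // Bus 0, device 0, function 0: vendor ID register sits right at the base.
    assert_eq!(ecam_offset(0, 0, 0, 0), 0);
    // Bus 0, device 1, function 0 starts 32 KiB into the window.
    assert_eq!(ecam_offset(0, 1, 0, 0), 0x8000);
    // Bus 1 starts 1 MiB into the window.
    assert_eq!(ecam_offset(1, 0, 0, 0), 0x10_0000);
}
```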
let mut sdt = MockSdt(vec![0; 5]); - let addr = writer.write_acpi_table(&mut sdt).unwrap(); + let addr = writer + .write_acpi_table(&mut resource_allocator, &mut sdt) + .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4096); let mut sdt = MockSdt(vec![0; 2]); - let addr = writer.write_acpi_table(&mut sdt).unwrap(); + let addr = writer + .write_acpi_table(&mut resource_allocator, &mut sdt) + .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4101); let mut sdt = MockSdt(vec![0; 4]); - let addr = writer.write_acpi_table(&mut sdt).unwrap(); + let addr = writer + .write_acpi_table(&mut resource_allocator, &mut sdt) + .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4103); let mut sdt = MockSdt(vec![0; 8]); - let addr = writer.write_acpi_table(&mut sdt).unwrap(); + let addr = writer + .write_acpi_table(&mut resource_allocator, &mut sdt) + .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4107); let mut sdt = MockSdt(vec![0; 16]); - let addr = writer.write_acpi_table(&mut sdt).unwrap(); + let addr = writer + .write_acpi_table(&mut resource_allocator, &mut sdt) + .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4115); } @@ -268,11 +312,13 @@ mod tests { let (_, vm) = setup_vm_with_memory(u64_to_usize(SYSTEM_MEM_START + SYSTEM_MEM_SIZE - 4096)); let mut writer = AcpiTableWriter { mem: vm.guest_memory(), - resource_allocator: &mut ResourceAllocator::new().unwrap(), }; + let mut resource_allocator = ResourceAllocator::new(); let mut sdt = MockSdt(vec![0; usize::try_from(SYSTEM_MEM_SIZE).unwrap()]); - let err = writer.write_acpi_table(&mut sdt).unwrap_err(); + let err = writer + .write_acpi_table(&mut resource_allocator, &mut sdt) + .unwrap_err(); assert!( matches!( err, diff --git a/src/vmm/src/acpi/x86_64.rs b/src/vmm/src/acpi/x86_64.rs index de850a9989f..53eeac7b5e2 100644 --- a/src/vmm/src/acpi/x86_64.rs +++ b/src/vmm/src/acpi/x86_64.rs @@ -3,10 +3,7 @@ use std::mem::size_of; -use acpi_tables::fadt::{ - IAPC_BOOT_ARG_FLAGS_MSI_NOT_PRESENT, IAPC_BOOT_ARG_FLAGS_PCI_ASPM, - IAPC_BOOT_ARG_FLAGS_VGA_NOT_PRESENT, -}; +use acpi_tables::fadt::IAPC_BOOT_ARG_FLAGS_VGA_NOT_PRESENT; use acpi_tables::madt::{IoAPIC, LocalAPIC}; use acpi_tables::{Fadt, aml}; use vm_memory::GuestAddress; @@ -33,11 +30,7 @@ pub(crate) fn setup_arch_fadt(fadt: &mut Fadt) { // neither do we support ASPM, or MSI type of interrupts. // More info here: // https://uefi.org/specs/ACPI/6.5/05_ACPI_Software_Programming_Model.html?highlight=0a06#ia-pc-boot-architecture-flags - fadt.setup_iapc_flags( - (1 << IAPC_BOOT_ARG_FLAGS_VGA_NOT_PRESENT) - | (1 << IAPC_BOOT_ARG_FLAGS_PCI_ASPM) - | (1 << IAPC_BOOT_ARG_FLAGS_MSI_NOT_PRESENT), - ); + fadt.setup_iapc_flags(1 << IAPC_BOOT_ARG_FLAGS_VGA_NOT_PRESENT); } #[inline(always)] diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index 7d7f7d748a9..6a50c0257a9 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -5,17 +5,21 @@ // Use of this source code is governed by a BSD-style license that can be // found in the THIRD-PARTY file. 
-use std::collections::HashMap; use std::ffi::CString; use std::fmt::Debug; use vm_fdt::{Error as VmFdtError, FdtWriter, FdtWriterNode}; use vm_memory::GuestMemoryError; -use super::super::DeviceType; use super::cache_info::{CacheEntry, read_cache_config}; use super::gic::GICDevice; +use crate::arch::{ + MEM_32BIT_DEVICES_SIZE, MEM_32BIT_DEVICES_START, MEM_64BIT_DEVICES_SIZE, + MEM_64BIT_DEVICES_START, PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, +}; +use crate::device_manager::DeviceManager; use crate::device_manager::mmio::MMIODeviceInfo; +use crate::device_manager::pci_mngr::PciDevices; use crate::devices::acpi::vmgenid::{VMGENID_MEM_SIZE, VmGenId}; use crate::initrd::InitrdConfig; use crate::vstate::memory::{Address, GuestMemory, GuestMemoryMmap}; @@ -24,6 +28,8 @@ use crate::vstate::memory::{Address, GuestMemory, GuestMemoryMmap}; const GIC_PHANDLE: u32 = 1; // This is a value for uniquely identifying the FDT node containing the clock definition. const CLOCK_PHANDLE: u32 = 2; +// This is a value for uniquely identifying the FDT node declaring the MSI controller. +const MSI_PHANDLE: u32 = 3; // You may be wondering why this big value? // This phandle is used to uniquely identify the FDT nodes containing cache information. Each cpu // can have a variable number of caches, some of these caches may be shared with other cpus. @@ -55,14 +61,14 @@ pub enum FdtError { WriteFdtToMemory(#[from] GuestMemoryError), } +#[allow(clippy::too_many_arguments)] /// Creates the flattened device tree for this aarch64 microVM. pub fn create_fdt( guest_mem: &GuestMemoryMmap, vcpu_mpidr: Vec, cmdline: CString, - device_info: &HashMap<(DeviceType, String), MMIODeviceInfo>, + device_manager: &DeviceManager, gic_device: &GICDevice, - vmgenid: &Option, initrd: &Option, ) -> Result, FdtError> { // Allocate stuff necessary for storing the blob. @@ -89,8 +95,9 @@ pub fn create_fdt( create_timer_node(&mut fdt_writer)?; create_clock_node(&mut fdt_writer)?; create_psci_node(&mut fdt_writer)?; - create_devices_node(&mut fdt_writer, device_info)?; - create_vmgenid_node(&mut fdt_writer, vmgenid)?; + create_devices_node(&mut fdt_writer, device_manager)?; + create_vmgenid_node(&mut fdt_writer, &device_manager.acpi_devices.vmgenid)?; + create_pci_nodes(&mut fdt_writer, &device_manager.pci_devices)?; // End Header node. 
fdt_writer.end_node(root)?; @@ -297,6 +304,16 @@ fn create_gic_node(fdt: &mut FdtWriter, gic_device: &GICDevice) -> Result<(), Fd ]; fdt.property_array_u32("interrupts", &gic_intr)?; + + if let Some(msi_properties) = gic_device.msi_properties() { + let msic_node = fdt.begin_node("msic")?; + fdt.property_string("compatible", "arm,gic-v3-its")?; + fdt.property_null("msi-controller")?; + fdt.property_u32("phandle", MSI_PHANDLE)?; + fdt.property_array_u64("reg", msi_properties)?; + fdt.end_node(msic_node)?; + } + fdt.end_node(interrupt)?; Ok(()) @@ -362,7 +379,7 @@ fn create_virtio_node(fdt: &mut FdtWriter, dev_info: &MMIODeviceInfo) -> Result< "interrupts", &[ GIC_FDT_IRQ_TYPE_SPI, - dev_info.irq.unwrap(), + dev_info.gsi.unwrap(), IRQ_TYPE_EDGE_RISING, ], )?; @@ -383,7 +400,7 @@ fn create_serial_node(fdt: &mut FdtWriter, dev_info: &MMIODeviceInfo) -> Result< "interrupts", &[ GIC_FDT_IRQ_TYPE_SPI, - dev_info.irq.unwrap(), + dev_info.gsi.unwrap(), IRQ_TYPE_EDGE_RISING, ], )?; @@ -411,45 +428,117 @@ fn create_rtc_node(fdt: &mut FdtWriter, dev_info: &MMIODeviceInfo) -> Result<(), fn create_devices_node( fdt: &mut FdtWriter, - dev_info: &HashMap<(DeviceType, String), MMIODeviceInfo>, + device_manager: &DeviceManager, ) -> Result<(), FdtError> { - // Create one temp Vec to store all virtio devices - let mut ordered_virtio_device: Vec<&MMIODeviceInfo> = Vec::new(); - - for ((device_type, _device_id), info) in dev_info { - match device_type { - DeviceType::BootTimer => (), // since it's not a real device - DeviceType::Rtc => create_rtc_node(fdt, info)?, - DeviceType::Serial => create_serial_node(fdt, info)?, - DeviceType::Virtio(_) => { - ordered_virtio_device.push(info); - } - } + if let Some(rtc_info) = device_manager.mmio_devices.rtc_device_info() { + create_rtc_node(fdt, rtc_info)?; + } + + if let Some(serial_info) = device_manager.mmio_devices.serial_device_info() { + create_serial_node(fdt, serial_info)?; } + let mut virtio_mmio = device_manager.mmio_devices.virtio_device_info(); + // Sort out virtio devices by address from low to high and insert them into fdt table. - ordered_virtio_device.sort_by_key(|a| a.addr); - for ordered_device_info in ordered_virtio_device.drain(..) { + virtio_mmio.sort_by_key(|a| a.addr); + for ordered_device_info in virtio_mmio.drain(..) { create_virtio_node(fdt, ordered_device_info)?; } Ok(()) } +fn create_pci_nodes(fdt: &mut FdtWriter, pci_devices: &PciDevices) -> Result<(), FdtError> { + if pci_devices.pci_segment.is_none() { + return Ok(()); + } + + // Fine to unwrap here, we just checked it's not `None`. + let segment = pci_devices.pci_segment.as_ref().unwrap(); + + let pci_node_name = format!("pci@{:x}", segment.mmio_config_address); + // Each range here is a thruple of `(PCI address, CPU address, PCI size)`. 
+ // + // More info about the format can be found here: + // https://elinux.org/Device_Tree_Usage#PCI_Address_Translation + let ranges = [ + // 32bit addresses + 0x200_0000u32, + (MEM_32BIT_DEVICES_START >> 32) as u32, // PCI address + (MEM_32BIT_DEVICES_START & 0xffff_ffff) as u32, + (MEM_32BIT_DEVICES_START >> 32) as u32, // CPU address + (MEM_32BIT_DEVICES_START & 0xffff_ffff) as u32, + (MEM_32BIT_DEVICES_SIZE >> 32) as u32, // Range size + (MEM_32BIT_DEVICES_SIZE & 0xffff_ffff) as u32, + // 64bit addresses + 0x300_0000u32, + // PCI address + (MEM_64BIT_DEVICES_START >> 32) as u32, // PCI address + (MEM_64BIT_DEVICES_START & 0xffff_ffff) as u32, + // CPU address + (MEM_64BIT_DEVICES_START >> 32) as u32, // CPU address + (MEM_64BIT_DEVICES_START & 0xffff_ffff) as u32, + // Range size + (MEM_64BIT_DEVICES_SIZE >> 32) as u32, // Range size + ((MEM_64BIT_DEVICES_SIZE & 0xffff_ffff) >> 32) as u32, + ]; + + // See kernel document Documentation/devicetree/bindings/pci/pci-msi.txt + let msi_map = [ + // rid-base: A single cell describing the first RID matched by the entry. + 0x0, + // msi-controller: A single phandle to an MSI controller. + MSI_PHANDLE, + // msi-base: An msi-specifier describing the msi-specifier produced for the + // first RID matched by the entry. + segment.id as u32, + // length: A single cell describing how many consecutive RIDs are matched + // following the rid-base. + 0x100, + ]; + + let pci_node = fdt.begin_node(&pci_node_name)?; + + fdt.property_string("compatible", "pci-host-ecam-generic")?; + fdt.property_string("device_type", "pci")?; + fdt.property_array_u32("ranges", &ranges)?; + fdt.property_array_u32("bus-range", &[0, 0])?; + fdt.property_u32("linux,pci-domain", segment.id.into())?; + fdt.property_u32("#address-cells", 3)?; + fdt.property_u32("#size-cells", 2)?; + fdt.property_array_u64( + "reg", + &[ + segment.mmio_config_address, + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, + ], + )?; + fdt.property_u32("#interrupt-cells", 1)?; + fdt.property_null("interrupt-map")?; + fdt.property_null("interrupt-map-mask")?; + fdt.property_null("dma-coherent")?; + fdt.property_array_u32("msi-map", &msi_map)?; + fdt.property_u32("msi-parent", MSI_PHANDLE)?; + + Ok(fdt.end_node(pci_node)?) +} + #[cfg(test)] mod tests { use std::ffi::CString; + use std::sync::{Arc, Mutex}; - use kvm_ioctls::Kvm; + use linux_loader::cmdline as kernel_cmdline; use super::*; use crate::arch::aarch64::gic::create_gic; use crate::arch::aarch64::layout; - use crate::device_manager::resources::ResourceAllocator; + use crate::device_manager::mmio::tests::DummyDevice; + use crate::device_manager::tests::default_device_manager; use crate::test_utils::arch_mem; use crate::vstate::memory::GuestAddress; - - const LEN: u64 = 4096; + use crate::{EventManager, Kvm, Vm}; // The `load` function from the `device_tree` will mistakenly check the actual size // of the buffer with the allocated size. This works around that. 
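(Aside, not part of the patch: the `ranges` cells built in `create_pci_nodes` above follow the standard devicetree PCI translation layout — one `phys.hi` space cell, then the 64-bit PCI address, CPU address, and size, each split into a high and a low u32 cell. A hypothetical helper makes the decomposition explicit.)

    // Hypothetical helper, shown only to document the cell layout used above.
    // space = 0x0200_0000 for 32-bit memory space, 0x0300_0000 for 64-bit memory space.
    fn pci_range_cells(space: u32, pci_addr: u64, cpu_addr: u64, size: u64) -> [u32; 7] {
        let hi = |v: u64| (v >> 32) as u32;
        let lo = |v: u64| (v & 0xffff_ffff) as u32;
        [
            space,
            hi(pci_addr), lo(pci_addr), // PCI (child) address
            hi(cpu_addr), lo(cpu_addr), // CPU (parent) address
            hi(size), lo(size),         // range size
        ]
    }
    // e.g. pci_range_cells(0x0200_0000, MEM_32BIT_DEVICES_START,
    //                      MEM_32BIT_DEVICES_START, MEM_32BIT_DEVICES_SIZE)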
@@ -463,47 +552,30 @@ mod tests { #[test] fn test_create_fdt_with_devices() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); + let mut event_manager = EventManager::new().unwrap(); + let mut device_manager = default_device_manager(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = Vm::new(&kvm).unwrap(); + let gic = create_gic(vm.fd(), 1, None).unwrap(); + let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); + cmdline.insert("console", "/dev/tty0").unwrap(); + + device_manager + .attach_legacy_devices_aarch64(&vm, &mut event_manager, &mut cmdline) + .unwrap(); + let dummy = Arc::new(Mutex::new(DummyDevice::new())); + device_manager + .mmio_devices + .register_virtio_test_device(&vm, mem.clone(), dummy, &mut cmdline, "dummy") + .unwrap(); - let dev_info: HashMap<(DeviceType, std::string::String), MMIODeviceInfo> = [ - ( - (DeviceType::Serial, DeviceType::Serial.to_string()), - MMIODeviceInfo { - addr: 0x00, - irq: Some(1u32), - len: LEN, - }, - ), - ( - (DeviceType::Virtio(1), "virtio".to_string()), - MMIODeviceInfo { - addr: LEN, - irq: Some(2u32), - len: LEN, - }, - ), - ( - (DeviceType::Rtc, "rtc".to_string()), - MMIODeviceInfo { - addr: 2 * LEN, - irq: Some(3u32), - len: LEN, - }, - ), - ] - .iter() - .cloned() - .collect(); - let kvm = Kvm::new().unwrap(); - let vm = kvm.create_vm().unwrap(); - let gic = create_gic(&vm, 1, None).unwrap(); create_fdt( &mem, vec![0], - CString::new("console=tty0").unwrap(), - &dev_info, + cmdline.as_cstring().unwrap(), + &device_manager, &gic, &None, - &None, ) .unwrap(); } @@ -511,18 +583,21 @@ mod tests { #[test] fn test_create_fdt_with_vmgenid() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); - let mut resource_allocator = ResourceAllocator::new().unwrap(); - let vmgenid = VmGenId::new(&mem, &mut resource_allocator).unwrap(); - let kvm = Kvm::new().unwrap(); - let vm = kvm.create_vm().unwrap(); - let gic = create_gic(&vm, 1, None).unwrap(); + let mut device_manager = default_device_manager(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = Vm::new(&kvm).unwrap(); + let gic = create_gic(vm.fd(), 1, None).unwrap(); + let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); + cmdline.insert("console", "/dev/tty0").unwrap(); + + device_manager.attach_vmgenid_device(&mem, &vm).unwrap(); + create_fdt( &mem, vec![0], CString::new("console=tty0").unwrap(), - &HashMap::<(DeviceType, std::string::String), MMIODeviceInfo>::new(), + &device_manager, &gic, - &Some(vmgenid), &None, ) .unwrap(); @@ -531,9 +606,10 @@ mod tests { #[test] fn test_create_fdt() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); - let kvm = Kvm::new().unwrap(); - let vm = kvm.create_vm().unwrap(); - let gic = create_gic(&vm, 1, None).unwrap(); + let device_manager = default_device_manager(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = Vm::new(&kvm).unwrap(); + let gic = create_gic(vm.fd(), 1, None).unwrap(); let saved_dtb_bytes = match gic.fdt_compatibility() { "arm,gic-v3" => include_bytes!("output_GICv3.dtb"), @@ -545,10 +621,9 @@ mod tests { &mem, vec![0], CString::new("console=tty0").unwrap(), - &HashMap::<(DeviceType, std::string::String), MMIODeviceInfo>::new(), + &device_manager, &gic, &None, - &None, ) .unwrap(); @@ -588,9 +663,10 @@ mod tests { #[test] fn test_create_fdt_with_initrd() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); - let kvm = Kvm::new().unwrap(); - let vm = kvm.create_vm().unwrap(); - let gic = create_gic(&vm, 1, None).unwrap(); + let device_manager = default_device_manager(); + let kvm = Kvm::new(vec![]).unwrap(); + let 
vm = Vm::new(&kvm).unwrap(); + let gic = create_gic(vm.fd(), 1, None).unwrap(); let saved_dtb_bytes = match gic.fdt_compatibility() { "arm,gic-v3" => include_bytes!("output_initrd_GICv3.dtb"), @@ -607,9 +683,8 @@ mod tests { &mem, vec![0], CString::new("console=tty0").unwrap(), - &HashMap::<(DeviceType, std::string::String), MMIODeviceInfo>::new(), + &device_manager, &gic, - &None, &Some(initrd), ) .unwrap(); diff --git a/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs index 22aaa4b4b74..01fd4b4d73d 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs @@ -30,7 +30,7 @@ impl GICv2 { /// Get the address of the GICv2 distributor. const fn get_dist_addr() -> u64 { - super::layout::MAPPED_IO_START - GICv2::KVM_VGIC_V2_DIST_SIZE + super::layout::MMIO32_MEM_START - GICv2::KVM_VGIC_V2_DIST_SIZE } /// Get the size of the GIC_v2 distributor. @@ -68,7 +68,9 @@ impl GICv2 { GICv2::get_cpu_addr(), GICv2::get_cpu_size(), ], + msi_properties: None, vcpu_count, + its_device: None, }) } @@ -82,7 +84,7 @@ impl GICv2 { pub fn init_device_attributes(gic_device: &Self) -> Result<(), GicError> { // Setting up the distributor attribute. - // We are placing the GIC below 1GB so we need to substract the size of the distributor. + // We are placing the GIC below 1GB so we need to subtract the size of the distributor. Self::set_device_attribute( gic_device.device_fd(), kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ADDR, @@ -133,9 +135,9 @@ impl GICv2 { // On arm there are 3 types of interrupts: SGI (0-15), PPI (16-31), SPI (32-1020). // SPIs are used to signal interrupts from various peripherals accessible across // the whole system so these are the ones that we increment when adding a new virtio device. - // KVM_DEV_ARM_VGIC_GRP_NR_IRQS sets the highest SPI number. Consequently, we will have a - // total of `super::layout::IRQ_MAX - 32` usable SPIs in our microVM. - let nr_irqs: u32 = super::layout::IRQ_MAX; + // KVM_DEV_ARM_VGIC_GRP_NR_IRQS sets the number of interrupts (SGI, PPI, and SPI). + // Consequently, we need to add 32 to the number of SPIs ("legacy GSI"). + let nr_irqs: u32 = crate::arch::GSI_LEGACY_NUM + super::layout::SPI_START; let nr_irqs_ptr = &nr_irqs as *const u32; Self::set_device_attribute( gic_device.device_fd(), diff --git a/src/vmm/src/arch/aarch64/gic/gicv2/regs/dist_regs.rs b/src/vmm/src/arch/aarch64/gic/gicv2/regs/dist_regs.rs index 21a404b302b..09a33a4a1ff 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv2/regs/dist_regs.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv2/regs/dist_regs.rs @@ -8,7 +8,7 @@ use kvm_ioctls::DeviceFd; use crate::arch::aarch64::gic::GicError; use crate::arch::aarch64::gic::regs::{GicRegState, MmioReg, SimpleReg, VgicRegEngine}; -use crate::arch::{IRQ_BASE, IRQ_MAX}; +use crate::arch::{GSI_LEGACY_NUM, SPI_START}; // Distributor registers as detailed at page 75 from // https://developer.arm.com/documentation/ihi0048/latest/. @@ -62,9 +62,9 @@ impl MmioReg for SharedIrqReg { // read-as-zero/write-ignore (RAZ/WI) policy. // The first part of a shared-irq register, the one corresponding to the // SGI and PPI IRQs (0-32) is RAZ/WI, so we skip it. 
- let start = self.offset + u64::from(IRQ_BASE) * u64::from(self.bits_per_irq) / 8; + let start = self.offset + u64::from(SPI_START) * u64::from(self.bits_per_irq) / 8; - let size_in_bits = u64::from(self.bits_per_irq) * u64::from(IRQ_MAX - IRQ_BASE); + let size_in_bits = u64::from(self.bits_per_irq) * u64::from(GSI_LEGACY_NUM); let mut size_in_bytes = size_in_bits / 8; if size_in_bits % 8 > 0 { size_in_bytes += 1; diff --git a/src/vmm/src/arch/aarch64/gic/gicv2/regs/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv2/regs/mod.rs index 8bb26ce2bcd..2b617716fe2 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv2/regs/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv2/regs/mod.rs @@ -22,6 +22,7 @@ pub fn save_state(fd: &DeviceFd, mpidrs: &[u64]) -> Result { Ok(GicState { dist: dist_regs::get_dist_regs(fd)?, gic_vcpu_states: vcpu_states, + ..Default::default() }) } diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs index 558b47ab065..5d131cf7b76 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs @@ -1,7 +1,7 @@ // Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 -mod regs; +pub mod regs; use kvm_ioctls::{DeviceFd, VmFd}; @@ -18,19 +18,26 @@ impl std::ops::Deref for GICv3 { } } +impl std::ops::DerefMut for GICv3 { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + impl GICv3 { // Unfortunately bindgen omits defines that are based on other defines. // See arch/arm64/include/uapi/asm/kvm.h file from the linux kernel. const SZ_64K: u64 = 0x0001_0000; const KVM_VGIC_V3_DIST_SIZE: u64 = GICv3::SZ_64K; const KVM_VGIC_V3_REDIST_SIZE: u64 = (2 * GICv3::SZ_64K); + const GIC_V3_ITS_SIZE: u64 = 0x2_0000; // Device trees specific constants const ARCH_GIC_V3_MAINT_IRQ: u32 = 9; /// Get the address of the GIC distributor. fn get_dist_addr() -> u64 { - super::layout::MAPPED_IO_START - GICv3::KVM_VGIC_V3_DIST_SIZE + super::layout::MMIO32_MEM_START - GICv3::KVM_VGIC_V3_DIST_SIZE } /// Get the size of the GIC distributor. 
@@ -48,6 +55,16 @@ impl GICv3 { vcpu_count * GICv3::KVM_VGIC_V3_REDIST_SIZE } + /// Get the MSI address + fn get_msi_address(vcpu_count: u64) -> u64 { + Self::get_redists_addr(vcpu_count) - GICv3::GIC_V3_ITS_SIZE + } + + /// Get the MSI size + const fn get_msi_size() -> u64 { + GICv3::GIC_V3_ITS_SIZE + } + pub const VERSION: u32 = kvm_bindings::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_V3; pub fn fdt_compatibility(&self) -> &str { @@ -59,30 +76,43 @@ impl GICv3 { } /// Create the GIC device object - pub fn create_device(fd: DeviceFd, vcpu_count: u64) -> Self { - GICv3(super::GIC { - fd, + pub fn create_device(vm: &VmFd, vcpu_count: u64) -> Result { + // Create the GIC device + let mut gic_device = kvm_bindings::kvm_create_device { + type_: Self::VERSION, + fd: 0, + flags: 0, + }; + + let gic_fd = vm + .create_device(&mut gic_device) + .map_err(GicError::CreateGIC)?; + + Ok(GICv3(super::GIC { + fd: gic_fd, properties: [ GICv3::get_dist_addr(), GICv3::get_dist_size(), GICv3::get_redists_addr(vcpu_count), GICv3::get_redists_size(vcpu_count), ], + msi_properties: Some([GICv3::get_msi_address(vcpu_count), GICv3::get_msi_size()]), vcpu_count, - }) + its_device: None, + })) } pub fn save_device(&self, mpidrs: &[u64]) -> Result { - regs::save_state(&self.fd, mpidrs) + regs::save_state(&self.fd, self.its_device.as_ref().unwrap(), mpidrs) } pub fn restore_device(&self, mpidrs: &[u64], state: &GicState) -> Result<(), GicError> { - regs::restore_state(&self.fd, mpidrs, state) + regs::restore_state(&self.fd, self.its_device.as_ref().unwrap(), mpidrs, state) } pub fn init_device_attributes(gic_device: &Self) -> Result<(), GicError> { // Setting up the distributor attribute. - // We are placing the GIC below 1GB so we need to substract the size of the distributor. + // We are placing the GIC below 1GB so we need to subtract the size of the distributor. Self::set_device_attribute( gic_device.device_fd(), kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ADDR, @@ -104,25 +134,45 @@ impl GICv3 { Ok(()) } - /// Initialize a GIC device - pub fn init_device(vm: &VmFd) -> Result { - let mut gic_device = kvm_bindings::kvm_create_device { - type_: Self::VERSION, + fn init_its(vm: &VmFd, gic_device: &mut Self) -> Result<(), GicError> { + // ITS part attributes + let mut its_device = kvm_bindings::kvm_create_device { + type_: kvm_bindings::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_ITS, fd: 0, flags: 0, }; - vm.create_device(&mut gic_device) - .map_err(GicError::CreateGIC) + let its_fd = vm + .create_device(&mut its_device) + .map_err(GicError::CreateGIC)?; + + // Setting up the ITS attributes + Self::set_device_attribute( + &its_fd, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ADDR, + u64::from(kvm_bindings::KVM_VGIC_ITS_ADDR_TYPE), + &Self::get_msi_address(gic_device.vcpu_count()) as *const u64 as u64, + 0, + )?; + + Self::set_device_attribute( + &its_fd, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_CTRL, + u64::from(kvm_bindings::KVM_DEV_ARM_VGIC_CTRL_INIT), + 0, + 0, + )?; + + gic_device.its_device = Some(its_fd); + Ok(()) } /// Method to initialize the GIC device pub fn create(vm: &VmFd, vcpu_count: u64) -> Result { - let vgic_fd = Self::init_device(vm)?; - - let device = Self::create_device(vgic_fd, vcpu_count); + let mut device = Self::create_device(vm, vcpu_count)?; Self::init_device_attributes(&device)?; + Self::init_its(vm, &mut device)?; Self::finalize_device(&device)?; @@ -134,9 +184,9 @@ impl GICv3 { // On arm there are 3 types of interrupts: SGI (0-15), PPI (16-31), SPI (32-1020). 
// SPIs are used to signal interrupts from various peripherals accessible across // the whole system so these are the ones that we increment when adding a new virtio device. - // KVM_DEV_ARM_VGIC_GRP_NR_IRQS sets the highest SPI number. Consequently, we will have a - // total of `super::layout::IRQ_MAX - 32` usable SPIs in our microVM. - let nr_irqs: u32 = super::layout::IRQ_MAX; + // KVM_DEV_ARM_VGIC_GRP_NR_IRQS sets the number of interrupts (SGI, PPI, and SPI). + // Consequently, we need to add 32 to the number of SPIs ("legacy GSI"). + let nr_irqs: u32 = crate::arch::GSI_LEGACY_NUM + super::layout::SPI_START; let nr_irqs_ptr = &nr_irqs as *const u32; Self::set_device_attribute( gic_device.device_fd(), @@ -184,14 +234,14 @@ impl GICv3 { /// RDIST pending tables into guest RAM. /// /// The tables get flushed to guest RAM whenever the VM gets stopped. -fn save_pending_tables(fd: &DeviceFd) -> Result<(), GicError> { +fn save_pending_tables(gic_device: &DeviceFd) -> Result<(), GicError> { let init_gic_attr = kvm_bindings::kvm_device_attr { group: kvm_bindings::KVM_DEV_ARM_VGIC_GRP_CTRL, attr: u64::from(kvm_bindings::KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES), addr: 0, flags: 0, }; - fd.set_device_attr(&init_gic_attr).map_err(|err| { + gic_device.set_device_attr(&init_gic_attr).map_err(|err| { GicError::DeviceAttribute(err, true, kvm_bindings::KVM_DEV_ARM_VGIC_GRP_CTRL) }) } diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/regs/dist_regs.rs b/src/vmm/src/arch/aarch64/gic/gicv3/regs/dist_regs.rs index 96c617dcc17..5a6eafb7003 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv3/regs/dist_regs.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv3/regs/dist_regs.rs @@ -8,7 +8,7 @@ use kvm_ioctls::DeviceFd; use crate::arch::aarch64::gic::GicError; use crate::arch::aarch64::gic::regs::{GicRegState, MmioReg, SimpleReg, VgicRegEngine}; -use crate::arch::{IRQ_BASE, IRQ_MAX}; +use crate::arch::{GSI_LEGACY_NUM, SPI_START}; // Distributor registers as detailed at page 456 from // https://static.docs.arm.com/ihi0069/c/IHI0069C_gic_architecture_specification.pdf. @@ -64,9 +64,9 @@ impl MmioReg for SharedIrqReg { // read-as-zero/write-ignore (RAZ/WI) policy. // The first part of a shared-irq register, the one corresponding to the // SGI and PPI IRQs (0-32) is RAZ/WI, so we skip it. - let start = self.offset + u64::from(IRQ_BASE) * u64::from(self.bits_per_irq) / 8; + let start = self.offset + u64::from(SPI_START) * u64::from(self.bits_per_irq) / 8; - let size_in_bits = u64::from(self.bits_per_irq) * u64::from(IRQ_MAX - IRQ_BASE); + let size_in_bits = u64::from(self.bits_per_irq) * u64::from(GSI_LEGACY_NUM); let mut size_in_bytes = size_in_bits / 8; if size_in_bits % 8 > 0 { size_in_bytes += 1; diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/regs/its_regs.rs b/src/vmm/src/arch/aarch64/gic/gicv3/regs/its_regs.rs new file mode 100644 index 00000000000..ee4ecafba1e --- /dev/null +++ b/src/vmm/src/arch/aarch64/gic/gicv3/regs/its_regs.rs @@ -0,0 +1,135 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +use kvm_bindings::{ + KVM_DEV_ARM_ITS_RESTORE_TABLES, KVM_DEV_ARM_ITS_SAVE_TABLES, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_GRP_ITS_REGS, +}; +use kvm_ioctls::DeviceFd; +use serde::{Deserialize, Serialize}; + +use crate::arch::aarch64::gic::GicError; + +// ITS registers that we want to preserve across snapshots +const GITS_CTLR: u32 = 0x0000; +const GITS_IIDR: u32 = 0x0004; +const GITS_CBASER: u32 = 0x0080; +const GITS_CWRITER: u32 = 0x0088; +const GITS_CREADR: u32 = 0x0090; +const GITS_BASER: u32 = 0x0100; + +fn set_device_attribute( + its_device: &DeviceFd, + group: u32, + attr: u32, + val: u64, +) -> Result<(), GicError> { + let gicv3_its_attr = kvm_bindings::kvm_device_attr { + group, + attr: attr as u64, + addr: &val as *const u64 as u64, + flags: 0, + }; + + its_device + .set_device_attr(&gicv3_its_attr) + .map_err(|err| GicError::DeviceAttribute(err, true, group)) +} + +fn get_device_attribute(its_device: &DeviceFd, group: u32, attr: u32) -> Result { + let mut val = 0; + + let mut gicv3_its_attr = kvm_bindings::kvm_device_attr { + group, + attr: attr as u64, + addr: &mut val as *mut u64 as u64, + flags: 0, + }; + + // SAFETY: gicv3_its_attr.addr is safe to write to. + unsafe { its_device.get_device_attr(&mut gicv3_its_attr) } + .map_err(|err| GicError::DeviceAttribute(err, false, group))?; + + Ok(val) +} + +fn its_read_register(its_fd: &DeviceFd, attr: u32) -> Result { + get_device_attribute(its_fd, KVM_DEV_ARM_VGIC_GRP_ITS_REGS, attr) +} + +fn its_set_register(its_fd: &DeviceFd, attr: u32, val: u64) -> Result<(), GicError> { + set_device_attribute(its_fd, KVM_DEV_ARM_VGIC_GRP_ITS_REGS, attr, val) +} + +pub fn its_save_tables(its_fd: &DeviceFd) -> Result<(), GicError> { + set_device_attribute( + its_fd, + KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_ITS_SAVE_TABLES, + 0, + ) +} + +pub fn its_restore_tables(its_fd: &DeviceFd) -> Result<(), GicError> { + set_device_attribute( + its_fd, + KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_ITS_RESTORE_TABLES, + 0, + ) +} + +/// ITS registers that we save/restore during snapshot +#[derive(Debug, Default, Serialize, Deserialize)] +pub struct ItsRegisterState { + iidr: u64, + cbaser: u64, + creadr: u64, + cwriter: u64, + baser: [u64; 8], + ctlr: u64, +} + +impl ItsRegisterState { + /// Save ITS state + pub fn save(its_fd: &DeviceFd) -> Result { + let mut state = ItsRegisterState::default(); + + for i in 0..8 { + state.baser[i as usize] = its_read_register(its_fd, GITS_BASER + i * 8)?; + } + state.ctlr = its_read_register(its_fd, GITS_CTLR)?; + state.cbaser = its_read_register(its_fd, GITS_CBASER)?; + state.creadr = its_read_register(its_fd, GITS_CREADR)?; + state.cwriter = its_read_register(its_fd, GITS_CWRITER)?; + state.iidr = its_read_register(its_fd, GITS_IIDR)?; + + Ok(state) + } + + /// Restore ITS state + /// + /// We need to restore ITS registers in a very specific order for things to work. Take a look + /// at: + /// https://elixir.bootlin.com/linux/v6.1.141/source/Documentation/virt/kvm/devices/arm-vgic-its.rst#L60 + /// and + /// https://elixir.bootlin.com/linux/v6.1.141/source/Documentation/virt/kvm/devices/arm-vgic-its.rst#L123 + /// + /// for more details, but TL;DR is: + /// + /// We need to restore GITS_CBASER, GITS_CREADER, GITS_CWRITER, GITS_BASER and GITS_IIDR + /// registers before restoring ITS tables from guest memory. We also need to set GITS_CTLR + /// last. 
+ pub fn restore(&self, its_fd: &DeviceFd) -> Result<(), GicError> { + its_set_register(its_fd, GITS_IIDR, self.iidr)?; + its_set_register(its_fd, GITS_CBASER, self.cbaser)?; + its_set_register(its_fd, GITS_CREADR, self.creadr)?; + its_set_register(its_fd, GITS_CWRITER, self.cwriter)?; + for i in 0..8 { + its_set_register(its_fd, GITS_BASER + i * 8, self.baser[i as usize])?; + } + // We need to restore saved ITS tables before restoring GITS_CTLR + its_restore_tables(its_fd)?; + its_set_register(its_fd, GITS_CTLR, self.ctlr) + } +} diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs index 0531766dc54..3df0d4642d7 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs @@ -3,45 +3,63 @@ mod dist_regs; mod icc_regs; +pub mod its_regs; mod redist_regs; +use its_regs::{ItsRegisterState, its_save_tables}; use kvm_ioctls::DeviceFd; use crate::arch::aarch64::gic::GicError; use crate::arch::aarch64::gic::regs::{GicState, GicVcpuState}; /// Save the state of the GIC device. -pub fn save_state(fd: &DeviceFd, mpidrs: &[u64]) -> Result { +pub fn save_state( + gic_device: &DeviceFd, + its_device: &DeviceFd, + mpidrs: &[u64], +) -> Result { // Flush redistributors pending tables to guest RAM. - super::save_pending_tables(fd)?; + super::save_pending_tables(gic_device)?; + // Flush ITS tables into guest memory. + its_save_tables(its_device)?; let mut vcpu_states = Vec::with_capacity(mpidrs.len()); for mpidr in mpidrs { vcpu_states.push(GicVcpuState { - rdist: redist_regs::get_redist_regs(fd, *mpidr)?, - icc: icc_regs::get_icc_regs(fd, *mpidr)?, + rdist: redist_regs::get_redist_regs(gic_device, *mpidr)?, + icc: icc_regs::get_icc_regs(gic_device, *mpidr)?, }) } + let its_state = ItsRegisterState::save(its_device)?; + Ok(GicState { - dist: dist_regs::get_dist_regs(fd)?, + dist: dist_regs::get_dist_regs(gic_device)?, gic_vcpu_states: vcpu_states, + its_state: Some(its_state), }) } /// Restore the state of the GIC device. -pub fn restore_state(fd: &DeviceFd, mpidrs: &[u64], state: &GicState) -> Result<(), GicError> { - dist_regs::set_dist_regs(fd, &state.dist)?; +pub fn restore_state( + gic_device: &DeviceFd, + its_device: &DeviceFd, + mpidrs: &[u64], + state: &GicState, +) -> Result<(), GicError> { + dist_regs::set_dist_regs(gic_device, &state.dist)?; if mpidrs.len() != state.gic_vcpu_states.len() { return Err(GicError::InconsistentVcpuCount); } for (mpidr, vcpu_state) in mpidrs.iter().zip(&state.gic_vcpu_states) { - redist_regs::set_redist_regs(fd, *mpidr, &vcpu_state.rdist)?; - icc_regs::set_icc_regs(fd, *mpidr, &vcpu_state.icc)?; + redist_regs::set_redist_regs(gic_device, *mpidr, &vcpu_state.rdist)?; + icc_regs::set_icc_regs(gic_device, *mpidr, &vcpu_state.icc)?; } - Ok(()) + // Safe to unwrap here, as we know we support an ITS device, so `its_state.is_some()` is always + // `true`. + state.its_state.as_ref().unwrap().restore(its_device) } #[cfg(test)] @@ -59,9 +77,10 @@ mod tests { let vm = kvm.create_vm().unwrap(); let gic = create_gic(&vm, 1, Some(GICVersion::GICV3)).expect("Cannot create gic"); let gic_fd = gic.device_fd(); + let its_fd = gic.its_fd().unwrap(); let mpidr = vec![1]; - let res = save_state(gic_fd, &mpidr); + let res = save_state(gic_fd, its_fd, &mpidr); // We will receive an error if trying to call before creating vcpu. 
assert_eq!( format!("{:?}", res.unwrap_err()), @@ -73,8 +92,9 @@ mod tests { let _vcpu = vm.create_vcpu(0).unwrap(); let gic = create_gic(&vm, 1, Some(GICVersion::GICV3)).expect("Cannot create gic"); let gic_fd = gic.device_fd(); + let its_fd = gic.its_fd().unwrap(); - let vm_state = save_state(gic_fd, &mpidr).unwrap(); + let vm_state = save_state(gic_fd, its_fd, &mpidr).unwrap(); let val: u32 = 0; let gicd_statusr_off = 0x0010u64; let mut gic_dist_attr = kvm_bindings::kvm_device_attr { @@ -94,7 +114,7 @@ mod tests { assert_eq!(gicd_statusr.chunks[0], val); assert_eq!(vm_state.dist.len(), 12); - restore_state(gic_fd, &mpidr, &vm_state).unwrap(); - restore_state(gic_fd, &[1, 2], &vm_state).unwrap_err(); + restore_state(gic_fd, its_fd, &mpidr, &vm_state).unwrap(); + restore_state(gic_fd, its_fd, &[1, 2], &vm_state).unwrap_err(); } } diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/regs/redist_regs.rs b/src/vmm/src/arch/aarch64/gic/gicv3/regs/redist_regs.rs index 4d1ba3292c1..96aaebc87bd 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv3/regs/redist_regs.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv3/regs/redist_regs.rs @@ -28,11 +28,11 @@ const GICR_ICFGR0: SimpleReg = SimpleReg::new(GICR_SGI_OFFSET + 0x0C00, 8); // List with relevant redistributor registers that we will be restoring. static VGIC_RDIST_REGS: &[SimpleReg] = &[ - GICR_CTLR, GICR_STATUSR, GICR_WAKER, GICR_PROPBASER, GICR_PENDBASER, + GICR_CTLR, ]; // List with relevant SGI associated redistributor registers that we will be restoring. diff --git a/src/vmm/src/arch/aarch64/gic/mod.rs b/src/vmm/src/arch/aarch64/gic/mod.rs index cda423f478c..9bfabee1fea 100644 --- a/src/vmm/src/arch/aarch64/gic/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/mod.rs @@ -21,8 +21,14 @@ pub struct GIC { /// GIC device properties, to be used for setting up the fdt entry properties: [u64; 4], + /// MSI properties of the GIC device + msi_properties: Option<[u64; 2]>, + /// Number of CPUs handled by the device vcpu_count: u64, + + /// ITS device + its_device: Option, } impl GIC { /// Returns the file descriptor of the GIC device @@ -80,6 +86,14 @@ impl GICDevice { } } + /// Returns the file descriptor of the ITS device, if any + pub fn its_fd(&self) -> Option<&DeviceFd> { + match self { + Self::V2(_) => None, + Self::V3(x) => x.its_device.as_ref(), + } + } + /// Returns an array with GIC device properties pub fn device_properties(&self) -> &[u64] { match self { @@ -88,6 +102,14 @@ impl GICDevice { } } + /// Returns an array with MSI properties if GIC supports it + pub fn msi_properties(&self) -> Option<&[u64; 2]> { + match self { + Self::V2(x) => x.msi_properties.as_ref(), + Self::V3(x) => x.msi_properties.as_ref(), + } + } + /// Returns the number of vCPUs this GIC handles pub fn vcpu_count(&self) -> u64 { match self { diff --git a/src/vmm/src/arch/aarch64/gic/regs.rs b/src/vmm/src/arch/aarch64/gic/regs.rs index 60987cc973d..1afa7acde9c 100644 --- a/src/vmm/src/arch/aarch64/gic/regs.rs +++ b/src/vmm/src/arch/aarch64/gic/regs.rs @@ -10,6 +10,7 @@ use kvm_ioctls::DeviceFd; use serde::{Deserialize, Serialize}; use crate::arch::aarch64::gic::GicError; +use crate::arch::aarch64::gic::gicv3::regs::its_regs::ItsRegisterState; #[derive(Debug, Serialize, Deserialize)] pub struct GicRegState { @@ -30,6 +31,8 @@ pub struct GicState { pub dist: Vec>, /// The state of the vcpu interfaces. pub gic_vcpu_states: Vec, + /// The state of the ITS device. Only present with GICv3. 
+ pub its_state: Option, } /// Structure used for serializing the state of the GIC registers for a specific vCPU. diff --git a/src/vmm/src/arch/aarch64/layout.rs b/src/vmm/src/arch/aarch64/layout.rs index 8f95519830e..c4937e43c92 100644 --- a/src/vmm/src/arch/aarch64/layout.rs +++ b/src/vmm/src/arch/aarch64/layout.rs @@ -4,51 +4,53 @@ // ==== Address map in use in ARM development systems today ==== // // - 32-bit - - 36-bit - - 40-bit - -// 1024GB + + +-------------------+ <- 40-bit +// 1024GB + + +-------------------+ <- 40-bit // | | DRAM | // ~ ~ ~ ~ // | | | // | | | // | | | // | | | -// 544GB + + +-------------------+ +// 544GB + + +-------------------+ // | | Hole or DRAM | // | | | -// 512GB + + +-------------------+ +// 512GB + + +-------------------+ // | | Mapped | // | | I/O | // ~ ~ ~ ~ // | | | -// 256GB + + +-------------------+ +// 256GB + + +-------------------+ // | | Reserved | // ~ ~ ~ ~ // | | | -// 64GB + +-----------------------+-------------------+ <- 36-bit +// 64GB + +-----------------------+-------------------+ <- 36-bit // | | DRAM | // ~ ~ ~ ~ // | | | // | | | -// 34GB + +-----------------------+-------------------+ +// 34GB + +-----------------------+-------------------+ // | | Hole or DRAM | -// 32GB + +-----------------------+-------------------+ +// 32GB + +-----------------------+-------------------+ // | | Mapped I/O | // ~ ~ ~ ~ // | | | -// 16GB + +-----------------------+-------------------+ +// 16GB + +-----------------------+-------------------+ // | | Reserved | // ~ ~ ~ ~ -// 4GB +-------------------+-----------------------+-------------------+ <- 32-bit +// 4GB +-------------------+-----------------------+-------------------+ <- 32-bit // | 2GB of DRAM | // | | -// 2GB +-------------------+-----------------------+-------------------+ +// 2GB +-------------------+-----------------------+-------------------+ // | Mapped I/O | -// 1GB +-------------------+-----------------------+-------------------+ +// 1GB +-------------------+-----------------------+-------------------+ // | ROM & RAM & I/O | -// 0GB +-------------------+-----------------------+-------------------+ 0 +// 0GB +-------------------+-----------------------+-------------------+ 0 // - 32-bit - - 36-bit - - 40-bit - // // Taken from (http://infocenter.arm.com/help/topic/com.arm.doc.den0001c/DEN0001C_principles_of_arm_memory_maps.pdf). +use crate::device_manager::mmio::MMIO_LEN; + /// Start of RAM on 64 bit ARM. pub const DRAM_MEM_START: u64 = 0x8000_0000; // 2 GB. /// The maximum RAM size. @@ -74,19 +76,66 @@ pub const FDT_MAX_SIZE: usize = 0x20_0000; // * bigger than 32 // * less than 1023 and // * a multiple of 32. -/// The highest usable SPI on aarch64. -pub const IRQ_MAX: u32 = 128; +// The first 32 SPIs are reserved, but KVM already shifts the gsi we +// pass, so we go from 0 to 95 for legacy gsis ("irq") and the remaining +// we use for MSI. +/// Offset of first SPI in the GIC +pub const SPI_START: u32 = 32; +/// Last possible SPI in the GIC (128 total SPIs) +pub const SPI_END: u32 = 127; +/// First usable GSI id on aarch64 (corresponds to SPI #32). 
+pub const GSI_LEGACY_START: u32 = 0; +/// There are 128 SPIs available, but the first 32 are reserved +pub const GSI_LEGACY_NUM: u32 = SPI_END - SPI_START + 1; +/// Last available GSI +pub const GSI_LEGACY_END: u32 = GSI_LEGACY_START + GSI_LEGACY_NUM - 1; +/// First GSI used by MSI after legacy GSI +pub const GSI_MSI_START: u32 = GSI_LEGACY_END + 1; +/// The highest available GSI in KVM (KVM_MAX_IRQ_ROUTES=4096) +pub const GSI_MSI_END: u32 = 4095; +/// Number of GSI available for MSI. +pub const GSI_MSI_NUM: u32 = GSI_MSI_END - GSI_MSI_START + 1; -/// First usable interrupt on aarch64. -pub const IRQ_BASE: u32 = 32; +/// The start of the memory area reserved for MMIO 32-bit accesses. +/// Below this address will reside the GIC, above this address will reside the MMIO devices. +pub const MMIO32_MEM_START: u64 = 1 << 30; // 1GiB +/// The size of the memory area reserved for MMIO 32-bit accesses (1GiB). +pub const MMIO32_MEM_SIZE: u64 = DRAM_MEM_START - MMIO32_MEM_START; -// The Linux kernel automatically shifts the GSI by 32 if it is an SPI, -// allowing us to start numbering from 0 instead of 32. -/// The first usable GSI on aarch64. -pub const GSI_BASE: u32 = 0; +// The rest of the MMIO address space (256 MiB) we dedicate to PCIe for memory-mapped access to +// configuration. +/// Size of MMIO region for PCIe configuration accesses. +pub const PCI_MMCONFIG_SIZE: u64 = 256 << 20; +/// Start of MMIO region for PCIe configuration accesses. +pub const PCI_MMCONFIG_START: u64 = DRAM_MEM_START - PCI_MMCONFIG_SIZE; +/// MMIO space per PCIe segment +pub const PCI_MMIO_CONFIG_SIZE_PER_SEGMENT: u64 = 4096 * 256; -/// The maximum usable GSI on aarch64. -pub const GSI_MAX: u32 = IRQ_MAX - IRQ_BASE - 1; +// We reserve 768 MiB for devices at the beginning of the MMIO region. This includes space both for +// pure MMIO and PCIe devices. -/// Below this address will reside the GIC, above this address will reside the MMIO devices. -pub const MAPPED_IO_START: u64 = 1 << 30; // 1 GB +/// Memory region start for boot device. +pub const BOOT_DEVICE_MEM_START: u64 = MMIO32_MEM_START; +/// Memory region start for RTC device. +pub const RTC_MEM_START: u64 = BOOT_DEVICE_MEM_START + MMIO_LEN; +/// Memory region start for Serial device. +pub const SERIAL_MEM_START: u64 = RTC_MEM_START + MMIO_LEN; + +/// Beginning of memory region for device MMIO 32-bit accesses +pub const MEM_32BIT_DEVICES_START: u64 = SERIAL_MEM_START + MMIO_LEN; +/// Size of memory region for device MMIO 32-bit accesses +pub const MEM_32BIT_DEVICES_SIZE: u64 = PCI_MMCONFIG_START - MEM_32BIT_DEVICES_START; + +// 64-bits region for MMIO accesses +/// The start of the memory area reserved for MMIO 64-bit accesses. +pub const MMIO64_MEM_START: u64 = 256 << 30; +/// The size of the memory area reserved for MMIO 64-bit accesses. 
+pub const MMIO64_MEM_SIZE: u64 = 256 << 30; + +// At the moment, all of this region goes to devices +/// Beginning of memory region for device MMIO 64-bit accesses +pub const MEM_64BIT_DEVICES_START: u64 = MMIO64_MEM_START; +/// Size of memory region for device MMIO 32-bit accesses +pub const MEM_64BIT_DEVICES_SIZE: u64 = MMIO64_MEM_SIZE; +/// First address past the 64-bit MMIO gap +pub const FIRST_ADDR_PAST_64BITS_MMIO: u64 = MMIO64_MEM_START + MMIO64_MEM_SIZE; diff --git a/src/vmm/src/arch/aarch64/mod.rs b/src/vmm/src/arch/aarch64/mod.rs index ead827c08c4..a599db5dea7 100644 --- a/src/vmm/src/arch/aarch64/mod.rs +++ b/src/vmm/src/arch/aarch64/mod.rs @@ -24,15 +24,15 @@ use linux_loader::loader::pe::PE as Loader; use linux_loader::loader::{Cmdline, KernelLoader}; use vm_memory::GuestMemoryError; -use crate::arch::{BootProtocol, EntryPoint}; +use crate::arch::{BootProtocol, EntryPoint, arch_memory_regions_with_gap}; use crate::cpu_config::aarch64::{CpuConfiguration, CpuConfigurationError}; use crate::cpu_config::templates::CustomCpuTemplate; use crate::initrd::InitrdConfig; -use crate::utils::{align_up, usize_to_u64}; +use crate::utils::{align_up, u64_to_usize, usize_to_u64}; use crate::vmm_config::machine_config::MachineConfig; use crate::vstate::memory::{Address, Bytes, GuestAddress, GuestMemory, GuestMemoryMmap}; use crate::vstate::vcpu::KvmVcpuError; -use crate::{Vcpu, VcpuConfig, Vmm, logger}; +use crate::{DeviceManager, Kvm, Vcpu, VcpuConfig, Vm, logger}; /// Errors thrown while configuring aarch64 system. #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -51,47 +51,42 @@ pub enum ConfigurationError { VcpuConfigure(#[from] KvmVcpuError), } -/// The start of the memory area reserved for MMIO devices. -pub const MMIO_MEM_START: u64 = layout::MAPPED_IO_START; -/// The size of the memory area reserved for MMIO devices. -pub const MMIO_MEM_SIZE: u64 = layout::DRAM_MEM_START - layout::MAPPED_IO_START; //>> 1GB - /// Returns a Vec of the valid memory addresses for aarch64. /// See [`layout`](layout) module for a drawing of the specific memory model for this platform. -/// -/// The `offset` parameter specified the offset from [`layout::DRAM_MEM_START`]. -pub fn arch_memory_regions(offset: usize, size: usize) -> Vec<(GuestAddress, usize)> { +pub fn arch_memory_regions(size: usize) -> Vec<(GuestAddress, usize)> { assert!(size > 0, "Attempt to allocate guest memory of length 0"); - assert!( - offset.checked_add(size).is_some(), - "Attempt to allocate guest memory such that the address space would wrap around" - ); - assert!( - offset < layout::DRAM_MEM_MAX_SIZE, - "offset outside allowed DRAM range" - ); - let dram_size = min(size, layout::DRAM_MEM_MAX_SIZE - offset); + let dram_size = min(size, layout::DRAM_MEM_MAX_SIZE); if dram_size != size { logger::warn!( - "Requested offset/memory size {}/{} exceeds architectural maximum (1022GiB). Size has \ - been truncated to {}", - offset, + "Requested memory size {} exceeds architectural maximum (1022GiB). Size has been \ + truncated to {}", size, dram_size ); } - vec![( - GuestAddress(layout::DRAM_MEM_START + offset as u64), + let mut regions = vec![]; + if let Some((offset, remaining)) = arch_memory_regions_with_gap( + &mut regions, + u64_to_usize(layout::DRAM_MEM_START), dram_size, - )] + u64_to_usize(layout::MMIO64_MEM_START), + u64_to_usize(layout::MMIO64_MEM_SIZE), + ) { + regions.push((GuestAddress(offset as u64), remaining)); + } + + regions } /// Configures the system for booting Linux. 
+#[allow(clippy::too_many_arguments)] pub fn configure_system_for_boot( - vmm: &mut Vmm, + kvm: &Kvm, + vm: &Vm, + device_manager: &mut DeviceManager, vcpus: &mut [Vcpu], machine_config: &MachineConfig, cpu_template: &CustomCpuTemplate, @@ -111,11 +106,11 @@ pub fn configure_system_for_boot( cpu_config, }; - let optional_capabilities = vmm.kvm.optional_capabilities(); + let optional_capabilities = kvm.optional_capabilities(); // Configure vCPUs with normalizing and setting the generated CPU configuration. for vcpu in vcpus.iter_mut() { vcpu.kvm_vcpu.configure( - vmm.vm.guest_memory(), + vm.guest_memory(), entry_point, &vcpu_config, &optional_capabilities, @@ -131,19 +126,16 @@ pub fn configure_system_for_boot( .expect("Cannot create cstring from cmdline string"); let fdt = fdt::create_fdt( - vmm.vm.guest_memory(), + vm.guest_memory(), vcpu_mpidr, cmdline, - vmm.mmio_device_manager.get_device_info(), - vmm.vm.get_irqchip(), - &vmm.acpi_device_manager.vmgenid, + device_manager, + vm.get_irqchip(), initrd, )?; - let fdt_address = GuestAddress(get_fdt_addr(vmm.vm.guest_memory())); - vmm.vm - .guest_memory() - .write_slice(fdt.as_slice(), fdt_address)?; + let fdt_address = GuestAddress(get_fdt_addr(vm.guest_memory())); + vm.guest_memory().write_slice(fdt.as_slice(), fdt_address)?; Ok(()) } @@ -212,39 +204,66 @@ pub fn load_kernel( #[cfg(kani)] mod verification { - use vm_memory::GuestAddress; - - use crate::arch::aarch64::layout; + use crate::arch::aarch64::layout::{ + DRAM_MEM_MAX_SIZE, DRAM_MEM_START, FIRST_ADDR_PAST_64BITS_MMIO, MMIO64_MEM_START, + }; use crate::arch::arch_memory_regions; #[kani::proof] #[kani::unwind(3)] fn verify_arch_memory_regions() { - let offset: u64 = kani::any::(); - let len: u64 = kani::any::(); - + let len: usize = kani::any::(); kani::assume(len > 0); - kani::assume(offset.checked_add(len).is_some()); - kani::assume(offset < layout::DRAM_MEM_MAX_SIZE as u64); - let regions = arch_memory_regions(offset as usize, len as usize); + let regions = arch_memory_regions(len); - // No MMIO gap on ARM - assert_eq!(regions.len(), 1); + for region in ®ions { + println!( + "region: [{:x}:{:x})", + region.0.0, + region.0.0 + region.1 as u64 + ); + } - let (GuestAddress(start), actual_len) = regions[0]; - let actual_len = actual_len as u64; + // On Arm we have one MMIO gap that might fall within addressable ranges, + // so we can get either 1 or 2 regions. + assert!(regions.len() >= 1); + assert!(regions.len() <= 2); - assert_eq!(start, layout::DRAM_MEM_START + offset); - assert!(actual_len <= layout::DRAM_MEM_MAX_SIZE as u64); + // The total length of all regions cannot exceed DRAM_MEM_MAX_SIZE + let actual_len = regions.iter().map(|&(_, len)| len).sum::(); + assert!(actual_len <= DRAM_MEM_MAX_SIZE); + // The total length is smaller or equal to the length we asked assert!(actual_len <= len); + // If it's smaller, it's because we asked more than the the maximum possible. 
+ if (actual_len) < len { + assert!(len > DRAM_MEM_MAX_SIZE); + } - if actual_len < len { - assert_eq!( - start + actual_len, - layout::DRAM_MEM_START + layout::DRAM_MEM_MAX_SIZE as u64 - ); - assert!(offset + len >= layout::DRAM_MEM_MAX_SIZE as u64); + // No region overlaps the 64-bit MMIO gap + assert!( + regions + .iter() + .all(|&(start, len)| start.0 >= FIRST_ADDR_PAST_64BITS_MMIO + || start.0 + len as u64 <= MMIO64_MEM_START) + ); + + // All regions start after our DRAM_MEM_START + assert!(regions.iter().all(|&(start, _)| start.0 >= DRAM_MEM_START)); + + // All regions have non-zero length + assert!(regions.iter().all(|&(_, len)| len > 0)); + + // If there's two regions, they perfectly snuggle up the 64bit MMIO gap + if regions.len() == 2 { + kani::cover!(); + + // The very first address should be DRAM_MEM_START + assert_eq!(regions[0].0.0, DRAM_MEM_START); + // The first region ends at the beginning of the 64 bits gap. + assert_eq!(regions[0].0.0 + regions[0].1 as u64, MMIO64_MEM_START); + // The second region starts exactly after the 64 bits gap. + assert_eq!(regions[1].0.0, FIRST_ADDR_PAST_64BITS_MMIO); } } } @@ -252,33 +271,42 @@ mod verification { #[cfg(test)] mod tests { use super::*; + use crate::arch::aarch64::layout::{ + DRAM_MEM_MAX_SIZE, DRAM_MEM_START, FDT_MAX_SIZE, FIRST_ADDR_PAST_64BITS_MMIO, + MMIO64_MEM_START, + }; use crate::test_utils::arch_mem; #[test] fn test_regions_lt_1024gb() { - let regions = arch_memory_regions(0, 1usize << 29); + let regions = arch_memory_regions(1usize << 29); assert_eq!(1, regions.len()); - assert_eq!(GuestAddress(super::layout::DRAM_MEM_START), regions[0].0); + assert_eq!(GuestAddress(DRAM_MEM_START), regions[0].0); assert_eq!(1usize << 29, regions[0].1); } #[test] fn test_regions_gt_1024gb() { - let regions = arch_memory_regions(0, 1usize << 41); - assert_eq!(1, regions.len()); - assert_eq!(GuestAddress(super::layout::DRAM_MEM_START), regions[0].0); - assert_eq!(super::layout::DRAM_MEM_MAX_SIZE, regions[0].1); + let regions = arch_memory_regions(1usize << 41); + assert_eq!(2, regions.len()); + assert_eq!(GuestAddress(DRAM_MEM_START), regions[0].0); + assert_eq!(MMIO64_MEM_START - DRAM_MEM_START, regions[0].1 as u64); + assert_eq!(GuestAddress(FIRST_ADDR_PAST_64BITS_MMIO), regions[1].0); + assert_eq!( + DRAM_MEM_MAX_SIZE as u64 - MMIO64_MEM_START + DRAM_MEM_START, + regions[1].1 as u64 + ); } #[test] fn test_get_fdt_addr() { - let mem = arch_mem(layout::FDT_MAX_SIZE - 0x1000); - assert_eq!(get_fdt_addr(&mem), layout::DRAM_MEM_START); + let mem = arch_mem(FDT_MAX_SIZE - 0x1000); + assert_eq!(get_fdt_addr(&mem), DRAM_MEM_START); - let mem = arch_mem(layout::FDT_MAX_SIZE); - assert_eq!(get_fdt_addr(&mem), layout::DRAM_MEM_START); + let mem = arch_mem(FDT_MAX_SIZE); + assert_eq!(get_fdt_addr(&mem), DRAM_MEM_START); - let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); - assert_eq!(get_fdt_addr(&mem), 0x1000 + layout::DRAM_MEM_START); + let mem = arch_mem(FDT_MAX_SIZE + 0x1000); + assert_eq!(get_fdt_addr(&mem), 0x1000 + DRAM_MEM_START); } } diff --git a/src/vmm/src/arch/aarch64/output_GICv3.dtb b/src/vmm/src/arch/aarch64/output_GICv3.dtb index 03fba87f4fe..35f4e9b63a3 100644 Binary files a/src/vmm/src/arch/aarch64/output_GICv3.dtb and b/src/vmm/src/arch/aarch64/output_GICv3.dtb differ diff --git a/src/vmm/src/arch/aarch64/output_initrd_GICv3.dtb b/src/vmm/src/arch/aarch64/output_initrd_GICv3.dtb index 90e4a6cc0e2..fb6147ade9c 100644 Binary files a/src/vmm/src/arch/aarch64/output_initrd_GICv3.dtb and 
b/src/vmm/src/arch/aarch64/output_initrd_GICv3.dtb differ diff --git a/src/vmm/src/arch/aarch64/vcpu.rs b/src/vmm/src/arch/aarch64/vcpu.rs index 5d49dacac19..7a591bdee91 100644 --- a/src/vmm/src/arch/aarch64/vcpu.rs +++ b/src/vmm/src/arch/aarch64/vcpu.rs @@ -7,6 +7,7 @@ use std::fmt::{Debug, Write}; use std::mem::offset_of; +use std::sync::Arc; use kvm_bindings::*; use kvm_ioctls::{VcpuExit, VcpuFd, VmFd}; @@ -119,7 +120,7 @@ pub struct KvmVcpu { #[derive(Default, Debug)] pub struct Peripherals { /// mmio bus. - pub mmio_bus: Option, + pub mmio_bus: Option>, } impl KvmVcpu { diff --git a/src/vmm/src/arch/aarch64/vm.rs b/src/vmm/src/arch/aarch64/vm.rs index e54723f5b6d..eaec0932a42 100644 --- a/src/vmm/src/arch/aarch64/vm.rs +++ b/src/vmm/src/arch/aarch64/vm.rs @@ -1,11 +1,14 @@ // Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 +use std::sync::Mutex; + use serde::{Deserialize, Serialize}; use crate::Kvm; use crate::arch::aarch64::gic::GicState; use crate::vstate::memory::{GuestMemoryExtension, GuestMemoryState}; +use crate::vstate::resources::ResourceAllocator; use crate::vstate::vm::{VmCommon, VmError}; /// Structure representing the current architecture's understand of what a "virtual machine" is. @@ -74,6 +77,7 @@ impl ArchVm { .get_irqchip() .save_device(mpidrs) .map_err(ArchVmError::SaveGic)?, + resource_allocator: self.resource_allocator().clone(), }) } @@ -86,6 +90,7 @@ impl ArchVm { self.get_irqchip() .restore_device(mpidrs, &state.gic) .map_err(ArchVmError::RestoreGic)?; + self.common.resource_allocator = Mutex::new(state.resource_allocator.clone()); Ok(()) } @@ -98,4 +103,6 @@ pub struct VmState { pub memory: GuestMemoryState, /// GIC state. pub gic: GicState, + /// resource allocator + pub resource_allocator: ResourceAllocator, } diff --git a/src/vmm/src/arch/mod.rs b/src/vmm/src/arch/mod.rs index ebd270a2e61..6d33ce461b9 100644 --- a/src/vmm/src/arch/mod.rs +++ b/src/vmm/src/arch/mod.rs @@ -20,10 +20,15 @@ pub use aarch64::vcpu::*; pub use aarch64::vm::{ArchVm, ArchVmError, VmState}; #[cfg(target_arch = "aarch64")] pub use aarch64::{ - ConfigurationError, MMIO_MEM_SIZE, MMIO_MEM_START, arch_memory_regions, - configure_system_for_boot, get_kernel_start, initrd_load_addr, layout::CMDLINE_MAX_SIZE, - layout::GSI_BASE, layout::GSI_MAX, layout::IRQ_BASE, layout::IRQ_MAX, layout::SYSTEM_MEM_SIZE, - layout::SYSTEM_MEM_START, load_kernel, + ConfigurationError, arch_memory_regions, configure_system_for_boot, get_kernel_start, + initrd_load_addr, layout::BOOT_DEVICE_MEM_START, layout::CMDLINE_MAX_SIZE, + layout::GSI_LEGACY_END, layout::GSI_LEGACY_NUM, layout::GSI_LEGACY_START, layout::GSI_MSI_END, + layout::GSI_MSI_NUM, layout::GSI_MSI_START, layout::MEM_32BIT_DEVICES_SIZE, + layout::MEM_32BIT_DEVICES_START, layout::MEM_64BIT_DEVICES_SIZE, + layout::MEM_64BIT_DEVICES_START, layout::MMIO32_MEM_SIZE, layout::MMIO32_MEM_START, + layout::PCI_MMCONFIG_SIZE, layout::PCI_MMCONFIG_START, + layout::PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, layout::RTC_MEM_START, layout::SERIAL_MEM_START, + layout::SPI_START, layout::SYSTEM_MEM_SIZE, layout::SYSTEM_MEM_START, load_kernel, }; /// Module for x86_64 related functionality. 
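(Worked example, not part of the patch: with the aarch64 layout above — DRAM starting at 2 GiB and the 64-bit MMIO gap covering [256 GiB, 512 GiB) — a guest-memory request that reaches past the gap start is split into two regions that exactly straddle the gap, as the kani proof asserts.)

    // Illustrative numbers only; the named constants are the real ones from layout.rs.
    fn aarch64_regions_example() {
        let dram_start: u64 = 2 << 30;    // DRAM_MEM_START
        let gap_start: u64 = 256 << 30;   // MMIO64_MEM_START
        let gap_end: u64 = 512 << 30;     // MMIO64_MEM_START + MMIO64_MEM_SIZE
        let request: u64 = 512 << 30;     // 512 GiB of guest RAM
        let region0 = (dram_start, gap_start - dram_start);          // [2 GiB, 256 GiB), 254 GiB
        let region1 = (gap_end, request - (gap_start - dram_start)); // [512 GiB, ...), 258 GiB
        assert_eq!(region0.1 + region1.1, request);
    }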
@@ -39,10 +44,14 @@ pub use x86_64::vm::{ArchVm, ArchVmError, VmState}; #[cfg(target_arch = "x86_64")] pub use crate::arch::x86_64::{ - ConfigurationError, MMIO_MEM_SIZE, MMIO_MEM_START, arch_memory_regions, - configure_system_for_boot, get_kernel_start, initrd_load_addr, layout::APIC_ADDR, - layout::CMDLINE_MAX_SIZE, layout::GSI_BASE, layout::GSI_MAX, layout::IOAPIC_ADDR, - layout::IRQ_BASE, layout::IRQ_MAX, layout::SYSTEM_MEM_SIZE, layout::SYSTEM_MEM_START, + ConfigurationError, arch_memory_regions, configure_system_for_boot, get_kernel_start, + initrd_load_addr, layout::APIC_ADDR, layout::BOOT_DEVICE_MEM_START, layout::CMDLINE_MAX_SIZE, + layout::GSI_LEGACY_END, layout::GSI_LEGACY_NUM, layout::GSI_LEGACY_START, layout::GSI_MSI_END, + layout::GSI_MSI_NUM, layout::GSI_MSI_START, layout::IOAPIC_ADDR, + layout::MEM_32BIT_DEVICES_SIZE, layout::MEM_32BIT_DEVICES_START, + layout::MEM_64BIT_DEVICES_SIZE, layout::MEM_64BIT_DEVICES_START, layout::MMIO32_MEM_SIZE, + layout::MMIO32_MEM_START, layout::PCI_MMCONFIG_SIZE, layout::PCI_MMCONFIG_START, + layout::PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, layout::SYSTEM_MEM_SIZE, layout::SYSTEM_MEM_START, load_kernel, }; @@ -115,3 +124,32 @@ pub struct EntryPoint { /// Specifies which boot protocol to use pub protocol: BootProtocol, } + +/// Adds in [`regions`] the valid memory regions suitable for RAM taking into account a gap in the +/// available address space and returns the remaining region (if any) past this gap +fn arch_memory_regions_with_gap( + regions: &mut Vec<(GuestAddress, usize)>, + region_start: usize, + region_size: usize, + gap_start: usize, + gap_size: usize, +) -> Option<(usize, usize)> { + // 0-sized gaps don't really make sense. We should never receive such a gap. + assert!(gap_size > 0); + + let first_addr_past_gap = gap_start + gap_size; + match (region_start + region_size).checked_sub(gap_start) { + // case0: region fits all before gap + None | Some(0) => { + regions.push((GuestAddress(region_start as u64), region_size)); + None + } + // case1: region starts before the gap and goes past it + Some(remaining) if region_start < gap_start => { + regions.push((GuestAddress(region_start as u64), gap_start - region_start)); + Some((first_addr_past_gap, remaining)) + } + // case2: region starts past the gap + Some(_) => Some((first_addr_past_gap.max(region_start), region_size)), + } +} diff --git a/src/vmm/src/arch/x86_64/layout.rs b/src/vmm/src/arch/x86_64/layout.rs index a4c2f036906..34ad343af2a 100644 --- a/src/vmm/src/arch/x86_64/layout.rs +++ b/src/vmm/src/arch/x86_64/layout.rs @@ -7,6 +7,9 @@ //! Magic addresses externally used to lay out x86_64 VMs. +use crate::device_manager::mmio::MMIO_LEN; +use crate::utils::mib_to_bytes; + /// Initial stack for the boot CPU. pub const BOOT_STACK_POINTER: u64 = 0x8ff0; @@ -18,17 +21,21 @@ pub const CMDLINE_MAX_SIZE: usize = 2048; /// Start of the high memory. pub const HIMEM_START: u64 = 0x0010_0000; // 1 MB. -// Typically, on x86 systems 24 IRQs are used (0-23). -/// First usable IRQ ID for virtio device interrupts on x86_64. -pub const IRQ_BASE: u32 = 5; -/// Last usable IRQ ID for virtio device interrupts on x86_64. -pub const IRQ_MAX: u32 = 23; - -/// The first usable GSI on x86_64 is the same as the first usable IRQ ID. -pub const GSI_BASE: u32 = IRQ_BASE; - -/// The maximum usable GSI on x86_64 is the same as the last usable IRQ ID. -pub const GSI_MAX: u32 = IRQ_MAX; +// Typically, on x86 systems 24 IRQs are used for legacy devices (0-23). +// However, the first 5 are reserved. 
+// We allocate the remaining GSIs to MSIs. +/// First usable GSI for legacy interrupts (IRQ) on x86_64. +pub const GSI_LEGACY_START: u32 = 5; +/// Last usable GSI for legacy interrupts (IRQ) on x86_64. +pub const GSI_LEGACY_END: u32 = 23; +/// Number of legacy GSI (IRQ) available on x86_64. +pub const GSI_LEGACY_NUM: u32 = GSI_LEGACY_END - GSI_LEGACY_START + 1; +/// First GSI used by MSI after legacy GSI. +pub const GSI_MSI_START: u32 = GSI_LEGACY_END + 1; +/// The highest available GSI in KVM (KVM_MAX_IRQ_ROUTES=4096). +pub const GSI_MSI_END: u32 = 4095; +/// Number of GSI available for MSI. +pub const GSI_MSI_NUM: u32 = GSI_MSI_END - GSI_MSI_START + 1; /// Address for the TSS setup. pub const KVM_TSS_ADDRESS: u64 = 0xfffb_d000; @@ -83,3 +90,45 @@ pub const SYSTEM_MEM_START: u64 = 0x9fc00; /// 257KiB is more than we need, however we reserve this space for potential future use of /// ACPI features (new tables and/or devices). pub const SYSTEM_MEM_SIZE: u64 = RSDP_ADDR - SYSTEM_MEM_START; + +/// First address that cannot be addressed using 32 bit anymore. +pub const FIRST_ADDR_PAST_32BITS: u64 = 1 << 32; + +/// The size of the memory area reserved for MMIO 32-bit accesses. +pub const MMIO32_MEM_SIZE: u64 = mib_to_bytes(1024) as u64; +/// The start of the memory area reserved for MMIO 32-bit accesses. +pub const MMIO32_MEM_START: u64 = FIRST_ADDR_PAST_32BITS - MMIO32_MEM_SIZE; + +// We dedicate the last 256 MiB of the 32-bit MMIO address space PCIe for memory-mapped access to +// configuration. +/// Size of MMIO region for PCIe configuration accesses. +pub const PCI_MMCONFIG_SIZE: u64 = 256 << 20; +/// Start of MMIO region for PCIe configuration accesses. +pub const PCI_MMCONFIG_START: u64 = IOAPIC_ADDR as u64 - PCI_MMCONFIG_SIZE; +/// MMIO space per PCIe segment +pub const PCI_MMIO_CONFIG_SIZE_PER_SEGMENT: u64 = 4096 * 256; + +// We reserve 768 MiB for devices at the beginning of the MMIO region. This includes space both for +// pure MMIO and PCIe devices. + +/// Memory region start for boot device. +pub const BOOT_DEVICE_MEM_START: u64 = MMIO32_MEM_START; + +/// Beginning of memory region for device MMIO 32-bit accesses +pub const MEM_32BIT_DEVICES_START: u64 = BOOT_DEVICE_MEM_START + MMIO_LEN; +/// Size of memory region for device MMIO 32-bit accesses +pub const MEM_32BIT_DEVICES_SIZE: u64 = PCI_MMCONFIG_START - MEM_32BIT_DEVICES_START; + +// 64-bits region for MMIO accesses +/// The start of the memory area reserved for MMIO 64-bit accesses. +pub const MMIO64_MEM_START: u64 = 256 << 30; +/// The size of the memory area reserved for MMIO 64-bit accesses. 
+pub const MMIO64_MEM_SIZE: u64 = 256 << 30; + +// At the moment, all of this region goes to devices +/// Beginning of memory region for device MMIO 64-bit accesses +pub const MEM_64BIT_DEVICES_START: u64 = MMIO64_MEM_START; +/// Size of memory region for device MMIO 32-bit accesses +pub const MEM_64BIT_DEVICES_SIZE: u64 = MMIO64_MEM_SIZE; +/// First address past the 64-bit MMIO gap +pub const FIRST_ADDR_PAST_64BITS_MMIO: u64 = MMIO64_MEM_START + MMIO64_MEM_SIZE; diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index ca350cbf9af..1822abb9009 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -33,7 +33,11 @@ pub mod generated; use std::fs::File; -use layout::CMDLINE_START; +use kvm::Kvm; +use layout::{ + CMDLINE_START, FIRST_ADDR_PAST_32BITS, FIRST_ADDR_PAST_64BITS_MMIO, MMIO32_MEM_SIZE, + MMIO32_MEM_START, MMIO64_MEM_SIZE, MMIO64_MEM_START, PCI_MMCONFIG_SIZE, PCI_MMCONFIG_START, +}; use linux_loader::configurator::linux::LinuxBootConfigurator; use linux_loader::configurator::pvh::PvhBootConfigurator; use linux_loader::configurator::{BootConfigurator, BootParams}; @@ -47,17 +51,18 @@ use log::debug; use super::EntryPoint; use crate::acpi::create_acpi_tables; -use crate::arch::{BootProtocol, SYSTEM_MEM_SIZE, SYSTEM_MEM_START}; +use crate::arch::{BootProtocol, SYSTEM_MEM_SIZE, SYSTEM_MEM_START, arch_memory_regions_with_gap}; use crate::cpu_config::templates::{CustomCpuTemplate, GuestConfigError}; use crate::cpu_config::x86_64::CpuConfiguration; +use crate::device_manager::DeviceManager; use crate::initrd::InitrdConfig; -use crate::utils::{align_down, mib_to_bytes, u64_to_usize, usize_to_u64}; +use crate::utils::{align_down, u64_to_usize, usize_to_u64}; use crate::vmm_config::machine_config::MachineConfig; use crate::vstate::memory::{ Address, GuestAddress, GuestMemory, GuestMemoryMmap, GuestMemoryRegion, }; use crate::vstate::vcpu::KvmVcpuConfigureError; -use crate::{Vcpu, VcpuConfig, Vmm}; +use crate::{Vcpu, VcpuConfig, Vm, logger}; // Value taken from https://elixir.bootlin.com/linux/v5.10.68/source/arch/x86/include/uapi/asm/e820.h#L31 // Usable normal RAM @@ -96,48 +101,53 @@ pub enum ConfigurationError { Acpi(#[from] crate::acpi::AcpiError), } -/// First address that cannot be addressed using 32 bit anymore. -pub const FIRST_ADDR_PAST_32BITS: u64 = 1 << 32; - -/// Size of MMIO gap at top of 32-bit address space. -pub const MEM_32BIT_GAP_SIZE: u64 = mib_to_bytes(768) as u64; -/// The start of the memory area reserved for MMIO devices. -pub const MMIO_MEM_START: u64 = FIRST_ADDR_PAST_32BITS - MEM_32BIT_GAP_SIZE; -/// The size of the memory area reserved for MMIO devices. -pub const MMIO_MEM_SIZE: u64 = MEM_32BIT_GAP_SIZE; - /// Returns a Vec of the valid memory addresses. /// These should be used to configure the GuestMemoryMmap structure for the platform. -/// For x86_64 all addresses are valid from the start of the kernel except a -/// carve out at the end of 32bit address space. -pub fn arch_memory_regions(offset: usize, size: usize) -> Vec<(GuestAddress, usize)> { +/// For x86_64 all addresses are valid from the start of the kernel except an 1GB +/// carve out at the end of 32bit address space and a second 256GB one at the 256GB limit. +pub fn arch_memory_regions(size: usize) -> Vec<(GuestAddress, usize)> { // If we get here with size == 0 something has seriously gone wrong. 
Firecracker should never // try to allocate guest memory of size 0 assert!(size > 0, "Attempt to allocate guest memory of length 0"); - assert!( - offset.checked_add(size).is_some(), - "Attempt to allocate guest memory such that the address space would wrap around" + + let dram_size = std::cmp::min( + usize::MAX - u64_to_usize(MMIO32_MEM_SIZE) - u64_to_usize(MMIO64_MEM_SIZE), + size, ); - // It's safe to cast MMIO_MEM_START to usize because it fits in a u32 variable - // (It points to an address in the 32 bit space). - match (size + offset).checked_sub(u64_to_usize(MMIO_MEM_START)) { - // case1: guest memory fits before the gap - None | Some(0) => vec![(GuestAddress(offset as u64), size)], - // case2: starts before the gap, but doesn't completely fit - Some(remaining) if (offset as u64) < MMIO_MEM_START => vec![ - ( - GuestAddress(offset as u64), - u64_to_usize(MMIO_MEM_START) - offset, - ), - (GuestAddress(FIRST_ADDR_PAST_32BITS), remaining), - ], - // case3: guest memory start after the gap - Some(_) => vec![( - GuestAddress(FIRST_ADDR_PAST_32BITS.max(offset as u64)), + if dram_size != size { + logger::warn!( + "Requested memory size {} exceeds architectural maximum (1022GiB). Size has been \ + truncated to {}", size, - )], + dram_size + ); } + + let mut regions = vec![]; + + if let Some((start_past_32bit_gap, remaining_past_32bit_gap)) = arch_memory_regions_with_gap( + &mut regions, + 0, + dram_size, + u64_to_usize(MMIO32_MEM_START), + u64_to_usize(MMIO32_MEM_SIZE), + ) { + if let Some((start_past_64bit_gap, remaining_past_64bit_gap)) = arch_memory_regions_with_gap( + &mut regions, + start_past_32bit_gap, + remaining_past_32bit_gap, + u64_to_usize(MMIO64_MEM_START), + u64_to_usize(MMIO64_MEM_SIZE), + ) { + regions.push(( + GuestAddress(start_past_64bit_gap as u64), + remaining_past_64bit_gap, + )); + } + } + + regions } /// Returns the memory address where the kernel could be loaded. @@ -161,8 +171,11 @@ pub fn initrd_load_addr(guest_mem: &GuestMemoryMmap, initrd_size: usize) -> Opti } /// Configures the system for booting Linux. +#[allow(clippy::too_many_arguments)] pub fn configure_system_for_boot( - vmm: &mut Vmm, + kvm: &Kvm, + vm: &Vm, + device_manager: &mut DeviceManager, vcpus: &mut [Vcpu], machine_config: &MachineConfig, cpu_template: &CustomCpuTemplate, @@ -171,8 +184,7 @@ pub fn configure_system_for_boot( boot_cmdline: Cmdline, ) -> Result<(), ConfigurationError> { // Construct the base CpuConfiguration to apply CPU template onto. - let cpu_config = - CpuConfiguration::new(vmm.kvm.supported_cpuid.clone(), cpu_template, &vcpus[0])?; + let cpu_config = CpuConfiguration::new(kvm.supported_cpuid.clone(), cpu_template, &vcpus[0])?; // Apply CPU template to the base CpuConfiguration. let cpu_config = CpuConfiguration::apply_template(cpu_config, cpu_template)?; @@ -185,7 +197,7 @@ pub fn configure_system_for_boot( // Configure vCPUs with normalizing and setting the generated CPU configuration. for vcpu in vcpus.iter_mut() { vcpu.kvm_vcpu - .configure(vmm.vm.guest_memory(), entry_point, &vcpu_config)?; + .configure(vm.guest_memory(), entry_point, &vcpu_config)?; } // Write the kernel command line to guest memory. 
This is x86_64 specific, since on @@ -196,7 +208,7 @@ pub fn configure_system_for_boot( .expect("Cannot create cstring from cmdline string"); load_cmdline( - vmm.vm.guest_memory(), + vm.guest_memory(), GuestAddress(crate::arch::x86_64::layout::CMDLINE_START), &boot_cmdline, ) @@ -204,19 +216,19 @@ pub fn configure_system_for_boot( // Note that this puts the mptable at the last 1k of Linux's 640k base RAM mptable::setup_mptable( - vmm.vm.guest_memory(), - &mut vmm.resource_allocator, + vm.guest_memory(), + &mut vm.resource_allocator(), vcpu_config.vcpu_count, ) .map_err(ConfigurationError::MpTableSetup)?; match entry_point.protocol { BootProtocol::PvhBoot => { - configure_pvh(vmm.vm.guest_memory(), GuestAddress(CMDLINE_START), initrd)?; + configure_pvh(vm.guest_memory(), GuestAddress(CMDLINE_START), initrd)?; } BootProtocol::LinuxBoot => { configure_64bit_boot( - vmm.vm.guest_memory(), + vm.guest_memory(), GuestAddress(CMDLINE_START), cmdline_size, initrd, @@ -227,10 +239,9 @@ pub fn configure_system_for_boot( // Create ACPI tables and write them in guest memory // For the time being we only support ACPI in x86_64 create_acpi_tables( - vmm.vm.guest_memory(), - &mut vmm.resource_allocator, - &vmm.mmio_device_manager, - &vmm.acpi_device_manager, + vm.guest_memory(), + device_manager, + &mut vm.resource_allocator(), vcpus, )?; Ok(()) @@ -243,7 +254,9 @@ fn configure_pvh( ) -> Result<(), ConfigurationError> { const XEN_HVM_START_MAGIC_VALUE: u32 = 0x336e_c578; let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS); - let end_32bit_gap_start = GuestAddress(MMIO_MEM_START); + let end_32bit_gap_start = GuestAddress(MMIO32_MEM_START); + let first_addr_past_64bits = GuestAddress(FIRST_ADDR_PAST_64BITS_MMIO); + let end_64bit_gap_start = GuestAddress(MMIO64_MEM_START); let himem_start = GuestAddress(layout::HIMEM_START); // Vector to hold modules (currently either empty or holding initrd). 
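The following is an illustrative, std-only sketch (not part of the patch) of what the rewritten arch_memory_regions() now does: guest DRAM is laid out around two reserved windows, the 1 GiB 32-bit MMIO gap below 4 GiB and the 256 GiB 64-bit MMIO gap starting at 256 GiB. The constants mirror the layout.rs values above; the helper name and the omission of the size-truncation path are simplifications made for this sketch.

// Sketch of the two-gap DRAM split; mirrors arch_memory_regions()/arch_memory_regions_with_gap().
fn split_dram(size: u64) -> Vec<(u64, u64)> {
    const MMIO32_MEM_START: u64 = (1 << 32) - (1024 << 20); // 0xc000_0000
    const FIRST_ADDR_PAST_32BITS: u64 = 1 << 32;
    const MMIO64_MEM_START: u64 = 256 << 30;
    const FIRST_ADDR_PAST_64BITS_MMIO: u64 = MMIO64_MEM_START + (256 << 30);

    let mut regions = Vec::new();
    let mut remaining = size;
    let mut next_start = 0;
    for (gap_start, gap_end) in [
        (MMIO32_MEM_START, FIRST_ADDR_PAST_32BITS),
        (MMIO64_MEM_START, FIRST_ADDR_PAST_64BITS_MMIO),
    ] {
        // Place as much DRAM as fits below this gap, then continue after it.
        let below = remaining.min(gap_start - next_start);
        if below > 0 {
            regions.push((next_start, below));
        }
        remaining -= below;
        next_start = gap_end;
        if remaining == 0 {
            return regions;
        }
    }
    regions.push((next_start, remaining));
    regions
}

fn main() {
    // 512 MiB fits entirely below the 32-bit gap.
    assert_eq!(split_dram(512 << 20), vec![(0, 512 << 20)]);
    // 4 GiB of DRAM is split around the 32-bit gap: 3 GiB low, 1 GiB starting at 4 GiB.
    assert_eq!(split_dram(4 << 30), vec![(0, 3 << 30), (1 << 32, 1 << 30)]);
}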
@@ -275,32 +288,42 @@ fn configure_pvh( type_: E820_RESERVED, ..Default::default() }); + memmap.push(hvm_memmap_table_entry { + addr: PCI_MMCONFIG_START, + size: PCI_MMCONFIG_SIZE, + type_: E820_RESERVED, + ..Default::default() + }); let last_addr = guest_mem.last_addr(); - if last_addr < end_32bit_gap_start { + + if last_addr > first_addr_past_64bits { memmap.push(hvm_memmap_table_entry { - addr: himem_start.raw_value(), - size: last_addr.unchecked_offset_from(himem_start) + 1, + addr: first_addr_past_64bits.raw_value(), + size: last_addr.unchecked_offset_from(first_addr_past_64bits) + 1, type_: MEMMAP_TYPE_RAM, ..Default::default() }); - } else { + } + + if last_addr > first_addr_past_32bits { memmap.push(hvm_memmap_table_entry { - addr: himem_start.raw_value(), - size: end_32bit_gap_start.unchecked_offset_from(himem_start), + addr: first_addr_past_32bits.raw_value(), + size: (end_64bit_gap_start.unchecked_offset_from(first_addr_past_32bits)) + .min(last_addr.unchecked_offset_from(first_addr_past_32bits) + 1), type_: MEMMAP_TYPE_RAM, ..Default::default() }); - - if last_addr > first_addr_past_32bits { - memmap.push(hvm_memmap_table_entry { - addr: first_addr_past_32bits.raw_value(), - size: last_addr.unchecked_offset_from(first_addr_past_32bits) + 1, - type_: MEMMAP_TYPE_RAM, - ..Default::default() - }); - } } + memmap.push(hvm_memmap_table_entry { + addr: himem_start.raw_value(), + size: end_32bit_gap_start + .unchecked_offset_from(himem_start) + .min(last_addr.unchecked_offset_from(himem_start) + 1), + type_: MEMMAP_TYPE_RAM, + ..Default::default() + }); + // Construct the hvm_start_info structure and serialize it into // boot_params. This will be stored at PVH_INFO_START address, and %rbx // will be initialized to contain PVH_INFO_START prior to starting the @@ -346,7 +369,9 @@ fn configure_64bit_boot( const KERNEL_LOADER_OTHER: u8 = 0xff; const KERNEL_MIN_ALIGNMENT_BYTES: u32 = 0x0100_0000; // Must be non-zero. 
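A small sketch (not part of the patch) of the clamping that both configure_pvh() above and configure_64bit_boot() below now apply to each RAM entry: an entry runs from its start address either up to the next MMIO gap or up to the last guest address, whichever comes first. The helper name is invented for this note, and HIMEM_START is assumed to be the existing 1 MiB value.

fn ram_entry_size(entry_start: u64, gap_start: u64, last_addr: u64) -> u64 {
    // min(distance to the gap, distance to the end of guest memory)
    (gap_start - entry_start).min(last_addr - entry_start + 1)
}

fn main() {
    const HIMEM_START: u64 = 0x10_0000; // 1 MiB (assumption)
    const MMIO32_MEM_START: u64 = 3 << 30; // 3 GiB

    // 128 MiB guest: the low-RAM entry stops at the end of guest memory.
    let last_addr = (128 << 20) - 1;
    assert_eq!(
        ram_entry_size(HIMEM_START, MMIO32_MEM_START, last_addr),
        (128 << 20) - HIMEM_START
    );

    // 3330 MiB guest: memory continues above 4 GiB, so the low-RAM entry is
    // clamped at the start of the 32-bit MMIO gap.
    let last_addr = (1 << 32) + (3330 << 20) - (3 << 30) - 1;
    assert_eq!(
        ram_entry_size(HIMEM_START, MMIO32_MEM_START, last_addr),
        MMIO32_MEM_START - HIMEM_START
    );
}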
let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS); - let end_32bit_gap_start = GuestAddress(MMIO_MEM_START); + let end_32bit_gap_start = GuestAddress(MMIO32_MEM_START); + let first_addr_past_64bits = GuestAddress(FIRST_ADDR_PAST_64BITS_MMIO); + let end_64bit_gap_start = GuestAddress(MMIO64_MEM_START); let himem_start = GuestAddress(layout::HIMEM_START); @@ -377,39 +402,42 @@ fn configure_64bit_boot( layout::SYSTEM_MEM_SIZE, E820_RESERVED, )?; + add_e820_entry( + &mut params, + PCI_MMCONFIG_START, + PCI_MMCONFIG_SIZE, + E820_RESERVED, + )?; let last_addr = guest_mem.last_addr(); - if last_addr < end_32bit_gap_start { + + if last_addr > first_addr_past_64bits { add_e820_entry( &mut params, - himem_start.raw_value(), - // it's safe to use unchecked_offset_from because - // mem_end > himem_start - last_addr.unchecked_offset_from(himem_start) + 1, + first_addr_past_64bits.raw_value(), + last_addr.unchecked_offset_from(first_addr_past_64bits) + 1, E820_RAM, )?; - } else { + } + + if last_addr > first_addr_past_32bits { add_e820_entry( &mut params, - himem_start.raw_value(), - // it's safe to use unchecked_offset_from because - // end_32bit_gap_start > himem_start - end_32bit_gap_start.unchecked_offset_from(himem_start), + first_addr_past_32bits.raw_value(), + (end_64bit_gap_start.unchecked_offset_from(first_addr_past_32bits)) + .min(last_addr.unchecked_offset_from(first_addr_past_32bits) + 1), E820_RAM, )?; - - if last_addr > first_addr_past_32bits { - add_e820_entry( - &mut params, - first_addr_past_32bits.raw_value(), - // it's safe to use unchecked_offset_from because - // mem_end > first_addr_past_32bits - last_addr.unchecked_offset_from(first_addr_past_32bits) + 1, - E820_RAM, - )?; - } } + add_e820_entry( + &mut params, + himem_start.raw_value(), + (last_addr.unchecked_offset_from(himem_start) + 1) + .min(end_32bit_gap_start.unchecked_offset_from(himem_start)), + E820_RAM, + )?; + LinuxBootConfigurator::write_bootparams( &BootParams::new(¶ms, GuestAddress(layout::ZERO_PAGE_START)), guest_mem, @@ -474,51 +502,69 @@ pub fn load_kernel( #[cfg(kani)] mod verification { - use crate::arch::x86_64::FIRST_ADDR_PAST_32BITS; - use crate::arch::{MMIO_MEM_START, arch_memory_regions}; + + use crate::arch::arch_memory_regions; + use crate::arch::x86_64::layout::{ + FIRST_ADDR_PAST_32BITS, FIRST_ADDR_PAST_64BITS_MMIO, MMIO32_MEM_SIZE, MMIO32_MEM_START, + MMIO64_MEM_SIZE, MMIO64_MEM_START, + }; + use crate::utils::u64_to_usize; #[kani::proof] - #[kani::unwind(3)] + #[kani::unwind(4)] fn verify_arch_memory_regions() { - let offset: u64 = kani::any::(); let len: u64 = kani::any::(); kani::assume(len > 0); - kani::assume(offset.checked_add(len).is_some()); - let regions = arch_memory_regions(offset as usize, len as usize); + let regions = arch_memory_regions(len as usize); - // There's only one MMIO gap, so we can get either 1 or 2 regions - assert!(regions.len() <= 2); + // There are two MMIO gaps, so we can get either 1, 2 or 3 regions + assert!(regions.len() <= 3); assert!(regions.len() >= 1); + // The first address is always 0 + assert_eq!(regions[0].0.0, 0); + // The total length of all regions is what we requested - assert_eq!( - regions.iter().map(|&(_, len)| len).sum::(), - len as usize - ); + let actual_size = regions.iter().map(|&(_, len)| len).sum::(); + assert!(actual_size <= len as usize); + if actual_size < u64_to_usize(len) { + assert_eq!( + actual_size, + usize::MAX - u64_to_usize(MMIO32_MEM_SIZE) - u64_to_usize(MMIO64_MEM_SIZE) + ); + } // No region overlaps the MMIO gap 
assert!( regions .iter() - .all(|&(start, len)| start.0 >= FIRST_ADDR_PAST_32BITS - || start.0 + len as u64 <= MMIO_MEM_START) + .all(|&(start, len)| (start.0 >= FIRST_ADDR_PAST_32BITS + || start.0 + len as u64 <= MMIO32_MEM_START) + && (start.0 >= FIRST_ADDR_PAST_64BITS_MMIO + || start.0 + len as u64 <= MMIO64_MEM_START)) ); - // All regions start after our specified offset - assert!(regions.iter().all(|&(start, _)| start.0 >= offset as u64)); - // All regions have non-zero length assert!(regions.iter().all(|&(_, len)| len > 0)); - // If there's two regions, they perfectly snuggle up to the MMIO gap - if regions.len() == 2 { + // If there's at least two regions, they perfectly snuggle up to one of the two MMIO gaps + if regions.len() >= 2 { kani::cover!(); - assert_eq!(regions[0].0.0 + regions[0].1 as u64, MMIO_MEM_START); + assert_eq!(regions[0].0.0 + regions[0].1 as u64, MMIO32_MEM_START); assert_eq!(regions[1].0.0, FIRST_ADDR_PAST_32BITS); } + + // If there are three regions, the last two perfectly snuggle up to the 64bit + // MMIO gap + if regions.len() == 3 { + kani::cover!(); + + assert_eq!(regions[1].0.0 + regions[1].1 as u64, MMIO64_MEM_START); + assert_eq!(regions[2].0.0, FIRST_ADDR_PAST_64BITS_MMIO); + } } } @@ -527,39 +573,27 @@ mod tests { use linux_loader::loader::bootparam::boot_e820_entry; use super::*; - use crate::device_manager::resources::ResourceAllocator; use crate::test_utils::{arch_mem, single_region_mem}; + use crate::utils::mib_to_bytes; + use crate::vstate::resources::ResourceAllocator; #[test] fn regions_lt_4gb() { - let regions = arch_memory_regions(0, 1usize << 29); + let regions = arch_memory_regions(1usize << 29); assert_eq!(1, regions.len()); assert_eq!(GuestAddress(0), regions[0].0); assert_eq!(1usize << 29, regions[0].1); - - let regions = arch_memory_regions(1 << 28, 1 << 29); - assert_eq!(1, regions.len()); - assert_eq!(regions[0], (GuestAddress(1 << 28), 1 << 29)); } #[test] fn regions_gt_4gb() { const MEMORY_SIZE: usize = (1 << 32) + 0x8000; - let regions = arch_memory_regions(0, MEMORY_SIZE); + let regions = arch_memory_regions(MEMORY_SIZE); assert_eq!(2, regions.len()); assert_eq!(GuestAddress(0), regions[0].0); assert_eq!(GuestAddress(1u64 << 32), regions[1].0); - let regions = arch_memory_regions(1 << 31, MEMORY_SIZE); - assert_eq!(2, regions.len()); - assert_eq!( - regions[0], - ( - GuestAddress(1 << 31), - u64_to_usize(MMIO_MEM_START) - (1 << 31) - ) - ); assert_eq!( regions[1], ( @@ -573,7 +607,7 @@ mod tests { fn test_system_configuration() { let no_vcpus = 4; let gm = single_region_mem(0x10000); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); let err = mptable::setup_mptable(&gm, &mut resource_allocator, 1); assert!(matches!( err.unwrap_err(), @@ -583,7 +617,7 @@ mod tests { // Now assigning some memory that falls before the 32bit memory hole. let mem_size = mib_to_bytes(128); let gm = arch_mem(mem_size); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); mptable::setup_mptable(&gm, &mut resource_allocator, no_vcpus).unwrap(); configure_64bit_boot(&gm, GuestAddress(0), 0, &None).unwrap(); configure_pvh(&gm, GuestAddress(0), &None).unwrap(); @@ -591,7 +625,7 @@ mod tests { // Now assigning some memory that is equal to the start of the 32bit memory hole. 
let mem_size = mib_to_bytes(3328); let gm = arch_mem(mem_size); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); mptable::setup_mptable(&gm, &mut resource_allocator, no_vcpus).unwrap(); configure_64bit_boot(&gm, GuestAddress(0), 0, &None).unwrap(); configure_pvh(&gm, GuestAddress(0), &None).unwrap(); @@ -599,7 +633,7 @@ mod tests { // Now assigning some memory that falls after the 32bit memory hole. let mem_size = mib_to_bytes(3330); let gm = arch_mem(mem_size); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); mptable::setup_mptable(&gm, &mut resource_allocator, no_vcpus).unwrap(); configure_64bit_boot(&gm, GuestAddress(0), 0, &None).unwrap(); configure_pvh(&gm, GuestAddress(0), &None).unwrap(); diff --git a/src/vmm/src/arch/x86_64/mptable.rs b/src/vmm/src/arch/x86_64/mptable.rs index 6646c17e282..99fb202c8d8 100644 --- a/src/vmm/src/arch/x86_64/mptable.rs +++ b/src/vmm/src/arch/x86_64/mptable.rs @@ -13,12 +13,12 @@ use libc::c_char; use log::debug; use vm_allocator::AllocPolicy; -use crate::arch::IRQ_MAX; +use crate::arch::GSI_LEGACY_END; use crate::arch::x86_64::generated::mpspec; -use crate::device_manager::resources::ResourceAllocator; use crate::vstate::memory::{ Address, ByteValued, Bytes, GuestAddress, GuestMemory, GuestMemoryMmap, }; +use crate::vstate::resources::ResourceAllocator; // These `mpspec` wrapper types are only data, reading them from data is a safe initialization. // SAFETY: POD @@ -109,7 +109,7 @@ fn compute_mp_size(num_cpus: u8) -> usize { + mem::size_of::() * (num_cpus as usize) + mem::size_of::() + mem::size_of::() - + mem::size_of::() * (IRQ_MAX as usize + 1) + + mem::size_of::() * (GSI_LEGACY_END as usize + 1) + mem::size_of::() * 2 } @@ -225,7 +225,7 @@ pub fn setup_mptable( mp_num_entries += 1; } // Per kvm_setup_default_irq_routing() in kernel - for i in 0..=u8::try_from(IRQ_MAX).map_err(|_| MptableError::TooManyIrqs)? { + for i in 0..=u8::try_from(GSI_LEGACY_END).map_err(|_| MptableError::TooManyIrqs)? 
{ let size = mem::size_of::() as u64; let mpc_intsrc = mpspec::mpc_intsrc { type_: mpspec::MP_INTSRC.try_into().unwrap(), @@ -334,7 +334,7 @@ mod tests { fn bounds_check() { let num_cpus = 4; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); } @@ -343,7 +343,7 @@ mod tests { fn bounds_check_fails() { let num_cpus = 4; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus) - 1); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap_err(); } @@ -352,7 +352,7 @@ mod tests { fn mpf_intel_checksum() { let num_cpus = 1; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); @@ -365,7 +365,7 @@ mod tests { fn mpc_table_checksum() { let num_cpus = 4; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); @@ -388,7 +388,7 @@ mod tests { fn mpc_entry_count() { let num_cpus = 1; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); @@ -406,7 +406,7 @@ mod tests { // ISA Bus + 1 // IRQ - + u16::try_from(IRQ_MAX).unwrap() + 1 + + u16::try_from(GSI_LEGACY_END).unwrap() + 1 // Interrupt source ExtINT + 1 // Interrupt source NMI @@ -419,7 +419,8 @@ mod tests { let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(MAX_SUPPORTED_CPUS)); for i in 0..MAX_SUPPORTED_CPUS { - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); + setup_mptable(&mem, &mut resource_allocator, i).unwrap(); let mpf_intel: mpspec::mpf_intel = @@ -450,7 +451,7 @@ mod tests { fn cpu_entry_count_max() { let cpus = MAX_SUPPORTED_CPUS + 1; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(cpus)); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); let result = setup_mptable(&mem, &mut resource_allocator, cpus).unwrap_err(); assert_eq!(result, MptableError::TooManyCpus); diff --git a/src/vmm/src/arch/x86_64/vcpu.rs b/src/vmm/src/arch/x86_64/vcpu.rs index b46d8e07b59..eea1f24ae69 100644 --- a/src/vmm/src/arch/x86_64/vcpu.rs +++ b/src/vmm/src/arch/x86_64/vcpu.rs @@ -7,6 +7,7 @@ use std::collections::BTreeMap; use std::fmt::Debug; +use std::sync::Arc; use kvm_bindings::{ CpuId, KVM_MAX_CPUID_ENTRIES, KVM_MAX_MSR_ENTRIES, Msrs, Xsave, kvm_debugregs, kvm_lapic_state, @@ -159,9 +160,9 @@ pub struct KvmVcpu { #[derive(Default, Debug)] pub struct Peripherals { /// Pio bus. - pub pio_bus: Option, + pub pio_bus: Option>, /// Mmio bus. - pub mmio_bus: Option, + pub mmio_bus: Option>, } impl KvmVcpu { @@ -266,7 +267,7 @@ impl KvmVcpu { } /// Sets a Port Mapped IO bus for this vcpu. 
- pub fn set_pio_bus(&mut self, pio_bus: crate::devices::Bus) { + pub fn set_pio_bus(&mut self, pio_bus: Arc) { self.peripherals.pio_bus = Some(pio_bus); } @@ -710,7 +711,9 @@ impl Peripherals { VcpuExit::IoIn(addr, data) => { if let Some(pio_bus) = &self.pio_bus { let _metric = METRICS.vcpu.exit_io_in_agg.record_latency_metrics(); - pio_bus.read(u64::from(addr), data); + if let Err(err) = pio_bus.read(u64::from(addr), data) { + warn!("vcpu: IO read @ {addr:#x}:{:#x} failed: {err}", data.len()); + } METRICS.vcpu.exit_io_in.inc(); } Ok(VcpuEmulation::Handled) @@ -718,7 +721,9 @@ impl Peripherals { VcpuExit::IoOut(addr, data) => { if let Some(pio_bus) = &self.pio_bus { let _metric = METRICS.vcpu.exit_io_out_agg.record_latency_metrics(); - pio_bus.write(u64::from(addr), data); + if let Err(err) = pio_bus.write(u64::from(addr), data) { + warn!("vcpu: IO write @ {addr:#x}:{:#x} failed: {err}", data.len()); + } METRICS.vcpu.exit_io_out.inc(); } Ok(VcpuEmulation::Handled) diff --git a/src/vmm/src/arch/x86_64/vm.rs b/src/vmm/src/arch/x86_64/vm.rs index e84b4338e35..e194296928d 100644 --- a/src/vmm/src/arch/x86_64/vm.rs +++ b/src/vmm/src/arch/x86_64/vm.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use std::fmt; +use std::sync::{Arc, Mutex}; use kvm_bindings::{ KVM_CLOCK_TSC_STABLE, KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, @@ -11,8 +12,10 @@ use kvm_ioctls::Cap; use serde::{Deserialize, Serialize}; use crate::arch::x86_64::msr::MsrError; +use crate::snapshot::Persist; use crate::utils::u64_to_usize; use crate::vstate::memory::{GuestMemoryExtension, GuestMemoryState}; +use crate::vstate::resources::ResourceAllocator; use crate::vstate::vm::{VmCommon, VmError}; /// Error type for [`Vm::restore_state`] @@ -56,6 +59,8 @@ pub struct ArchVm { /// /// `None` if `KVM_CAP_XSAVE2` not supported. xsave2_size: Option, + /// Port IO bus + pub pio_bus: Arc, } impl ArchVm { @@ -90,10 +95,13 @@ impl ArchVm { .set_tss_address(u64_to_usize(crate::arch::x86_64::layout::KVM_TSS_ADDRESS)) .map_err(ArchVmError::SetTssAddress)?; + let pio_bus = Arc::new(vm_device::Bus::new()); + Ok(ArchVm { common, msrs_to_save, xsave2_size, + pio_bus, }) } @@ -134,6 +142,7 @@ impl ArchVm { self.fd() .set_irqchip(&state.ioapic) .map_err(ArchVmError::SetIrqChipIoAPIC)?; + self.common.resource_allocator = Mutex::new(state.resource_allocator.clone()); Ok(()) } @@ -187,6 +196,7 @@ impl ArchVm { Ok(VmState { memory: self.common.guest_memory.describe(), + resource_allocator: self.resource_allocator().save(), pitstate, clock, pic_master, @@ -211,6 +221,8 @@ impl ArchVm { pub struct VmState { /// guest memory state pub memory: GuestMemoryState, + /// resource allocator + pub resource_allocator: ResourceAllocator, pitstate: kvm_pit_state2, clock: kvm_clock_data, // TODO: rename this field to adopt inclusive language once Linux updates it, too. 
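The vcpu changes above route port-IO exits through a shared, fallible bus and log a failed lookup instead of silently dropping it. Below is an illustrative, std-only sketch of that dispatch pattern; ToyBus is a stand-in written for this note and is not the real vm_device::Bus API.

use std::collections::BTreeMap;
use std::sync::{Arc, Mutex};

type ReadHandler = Box<dyn Fn(u64, &mut [u8]) + Send>;

#[derive(Default)]
struct ToyBus {
    // (base, len) -> handler; a real bus would also reject overlapping ranges on insert.
    devices: Mutex<BTreeMap<(u64, u64), ReadHandler>>,
}

impl ToyBus {
    fn insert(&self, base: u64, len: u64, handler: ReadHandler) {
        self.devices.lock().unwrap().insert((base, len), handler);
    }

    fn read(&self, addr: u64, data: &mut [u8]) -> Result<(), String> {
        let devices = self.devices.lock().unwrap();
        for (&(base, len), handler) in devices.iter() {
            if addr >= base && addr < base + len {
                handler(addr - base, data);
                return Ok(());
            }
        }
        Err(format!("no device registered at {addr:#x}"))
    }
}

fn main() {
    let pio_bus = Arc::new(ToyBus::default());
    // A fake serial port at 0x3f8 that always reads back zeroes.
    pio_bus.insert(0x3f8, 8, Box::new(|_offset, data| data.fill(0)));

    let mut data = [0xffu8; 1];
    // Mirrors the VcpuExit::IoIn arm: log the failure and keep running the vcpu.
    if let Err(err) = pio_bus.read(0x2f8, &mut data) {
        eprintln!("vcpu: IO read @ 0x2f8 failed: {err}");
    }
    pio_bus.read(0x3f8, &mut data).unwrap();
    assert_eq!(data[0], 0);
}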
diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 74f03e6b111..88d7f56cb4e 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -9,70 +9,61 @@ use std::io; use std::sync::mpsc; use std::sync::{Arc, Mutex}; -use event_manager::{MutEventSubscriber, SubscriberOps}; -use libc::EFD_NONBLOCK; +use event_manager::SubscriberOps; use linux_loader::cmdline::Cmdline as LoaderKernelCmdline; use userfaultfd::Uffd; use utils::time::TimestampUs; #[cfg(target_arch = "aarch64")] use vm_memory::GuestAddress; -#[cfg(target_arch = "aarch64")] -use vm_superio::Rtc; -use vm_superio::Serial; -use vmm_sys_util::eventfd::EventFd; +#[cfg(target_arch = "aarch64")] +use crate::Vcpu; use crate::arch::{ConfigurationError, configure_system_for_boot, load_kernel}; #[cfg(target_arch = "aarch64")] use crate::construct_kvm_mpidrs; -use crate::cpu_config::templates::{ - GetCpuTemplate, GetCpuTemplateError, GuestConfigError, KvmCapability, -}; -use crate::device_manager::acpi::ACPIDeviceManager; +use crate::cpu_config::templates::{GetCpuTemplate, GetCpuTemplateError, GuestConfigError}; #[cfg(target_arch = "x86_64")] -use crate::device_manager::legacy::PortIODeviceManager; -use crate::device_manager::mmio::{MMIODeviceManager, MmioError}; -use crate::device_manager::persist::{ - ACPIDeviceManagerConstructorArgs, ACPIDeviceManagerRestoreError, MMIODevManagerConstructorArgs, +use crate::device_manager; +use crate::device_manager::pci_mngr::PciManagerError; +use crate::device_manager::{ + AttachDeviceError, DeviceManager, DeviceManagerCreateError, DevicePersistError, + DeviceRestoreArgs, }; -use crate::device_manager::resources::ResourceAllocator; -use crate::devices::BusDevice; -use crate::devices::acpi::vmgenid::{VmGenId, VmGenIdError}; -#[cfg(target_arch = "aarch64")] -use crate::devices::legacy::RTCDevice; -use crate::devices::legacy::serial::SerialOut; -use crate::devices::legacy::{EventFdTrigger, SerialEventsWrapper, SerialWrapper}; +use crate::devices::acpi::vmgenid::VmGenIdError; use crate::devices::virtio::balloon::Balloon; use crate::devices::virtio::block::device::Block; -use crate::devices::virtio::device::VirtioDevice; -use crate::devices::virtio::mmio::MmioTransport; use crate::devices::virtio::net::Net; use crate::devices::virtio::rng::Entropy; use crate::devices::virtio::vsock::{Vsock, VsockUnixBackend}; #[cfg(feature = "gdb")] use crate::gdb; use crate::initrd::{InitrdConfig, InitrdError}; -use crate::logger::{debug, error}; +use crate::logger::debug; use crate::persist::{MicrovmState, MicrovmStateError}; use crate::resources::VmResources; use crate::seccomp::BpfThreadMap; use crate::snapshot::Persist; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::MachineConfigError; -use crate::vstate::kvm::Kvm; +use crate::vstate::kvm::{Kvm, KvmError}; use crate::vstate::memory::GuestRegionMmap; -use crate::vstate::vcpu::{Vcpu, VcpuError}; -use crate::vstate::vm::Vm; -use crate::{EventManager, Vmm, VmmError, device_manager}; +#[cfg(target_arch = "aarch64")] +use crate::vstate::resources::ResourceAllocator; +use crate::vstate::vcpu::VcpuError; +use crate::vstate::vm::{Vm, VmError}; +use crate::{EventManager, Vmm, VmmError}; /// Errors associated with starting the instance. 
#[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum StartMicrovmError { /// Unable to attach block device to Vmm: {0} AttachBlockDevice(io::Error), - /// Unable to attach the VMGenID device: {0} - AttachVmgenidDevice(kvm_ioctls::Error), + /// Could not attach device: {0} + AttachDevice(#[from] AttachDeviceError), /// System configuration error: {0} ConfigureSystem(#[from] ConfigurationError), + /// Failed to create device manager: {0} + CreateDeviceManager(#[from] DeviceManagerCreateError), /// Failed to create guest config: {0} CreateGuestConfig(#[from] GuestConfigError), /// Cannot create network device: {0} @@ -84,6 +75,8 @@ pub enum StartMicrovmError { CreateLegacyDevice(device_manager::legacy::LegacyDeviceError), /// Error creating VMGenID device: {0} CreateVMGenID(VmGenIdError), + /// Error enabling PCIe support: {0} + EnablePciDevices(#[from] PciManagerError), /// Error enabling pvtime on vcpu: {0} #[cfg(target_arch = "aarch64")] EnablePVTime(crate::arch::VcpuArchError), @@ -97,6 +90,8 @@ pub enum StartMicrovmError { GetCpuTemplate(#[from] GetCpuTemplateError), /// Invalid kernel command line: {0} KernelCmdline(String), + /// Kvm error: {0} + Kvm(#[from] KvmError), /// Cannot load command line string: {0} LoadCommandline(linux_loader::loader::Error), /// Cannot start microvm without kernel configuration. @@ -109,8 +104,6 @@ pub enum StartMicrovmError { NetDeviceNotConfigured, /// Cannot open the block device backing file: {0} OpenBlockDevice(io::Error), - /// Cannot initialize a MMIO Device or add a device to the MMIO Bus or cmdline: {0} - RegisterMmioDevice(#[from] device_manager::mmio::MmioError), /// Cannot restore microvm state: {0} RestoreMicrovmState(MicrovmStateError), /// Cannot set vm resources: {0} @@ -125,6 +118,8 @@ pub enum StartMicrovmError { /// Error cloning Vcpu fds #[cfg(feature = "gdb")] VcpuFdCloneError(#[from] crate::vstate::vcpu::CopyKvmFdError), + /// Error with the Vm object: {0} + Vm(#[from] VmError), } /// It's convenient to automatically convert `linux_loader::cmdline::Error`s @@ -135,67 +130,6 @@ impl std::convert::From for StartMicrovmError { } } -#[cfg_attr(target_arch = "aarch64", allow(unused))] -fn create_vmm_and_vcpus( - instance_info: &InstanceInfo, - event_manager: &mut EventManager, - vcpu_count: u8, - kvm_capabilities: Vec, -) -> Result<(Vmm, Vec), VmmError> { - let kvm = Kvm::new(kvm_capabilities)?; - // Set up Kvm Vm and register memory regions. - // Build custom CPU config if a custom template is provided. - let mut vm = Vm::new(&kvm)?; - - let resource_allocator = ResourceAllocator::new()?; - - // Instantiate the MMIO device manager. - let mmio_device_manager = MMIODeviceManager::new(); - - // Instantiate ACPI device manager. - let acpi_device_manager = ACPIDeviceManager::new(); - - let (vcpus, vcpus_exit_evt) = vm.create_vcpus(vcpu_count)?; - - #[cfg(target_arch = "x86_64")] - let pio_device_manager = { - // Make stdout non blocking. - set_stdout_nonblocking(); - - // Serial device setup. - let serial_device = setup_serial_device(event_manager, std::io::stdin(), io::stdout())?; - - // x86_64 uses the i8042 reset event as the Vmm exit event. 
- let reset_evt = vcpus_exit_evt.try_clone().map_err(VmmError::EventFd)?; - - // create pio dev manager with legacy devices - let mut pio_dev_mgr = - PortIODeviceManager::new(serial_device, reset_evt).map_err(VmmError::LegacyIOBus)?; - pio_dev_mgr - .register_devices(vm.fd()) - .map_err(VmmError::LegacyIOBus)?; - pio_dev_mgr - }; - - let vmm = Vmm { - events_observer: Some(std::io::stdin()), - instance_info: instance_info.clone(), - shutdown_exit_code: None, - kvm, - vm, - uffd: None, - vcpus_handles: Vec::new(), - vcpus_exit_evt, - resource_allocator, - mmio_device_manager, - #[cfg(target_arch = "x86_64")] - pio_device_manager, - acpi_device_manager, - }; - - Ok((vmm, vcpus)) -} - /// Builds and starts a microVM based on the current Firecracker VmResources configuration. /// /// The built microVM and all the created vCPUs start off in the paused state. @@ -207,8 +141,6 @@ pub fn build_microvm_for_boot( event_manager: &mut EventManager, seccomp_filters: &BpfThreadMap, ) -> Result>, StartMicrovmError> { - use self::StartMicrovmError::*; - // Timestamp for measuring microVM boot duration. let request_ts = TimestampUs::default(); @@ -216,7 +148,7 @@ pub fn build_microvm_for_boot( .boot_source .builder .as_ref() - .ok_or(MissingKernelConfig)?; + .ok_or(StartMicrovmError::MissingKernelConfig)?; let guest_memory = vm_resources .allocate_guest_memory() @@ -231,19 +163,19 @@ pub fn build_microvm_for_boot( .cpu_template .get_cpu_template()?; - let (mut vmm, mut vcpus) = create_vmm_and_vcpus( - instance_info, - event_manager, - vm_resources.machine_config.vcpu_count, - cpu_template.kvm_capabilities.clone(), - )?; + let kvm = Kvm::new(cpu_template.kvm_capabilities.clone())?; + // Set up Kvm Vm and register memory regions. + // Build custom CPU config if a custom template is provided. + let mut vm = Vm::new(&kvm)?; + let (mut vcpus, vcpus_exit_evt) = vm.create_vcpus(vm_resources.machine_config.vcpu_count)?; + vm.register_memory_regions(guest_memory)?; - vmm.vm - .register_memory_regions(guest_memory) - .map_err(VmmError::Vm)?; + let mut device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, &vm)?; - let entry_point = load_kernel(&boot_config.kernel_file, vmm.vm.guest_memory())?; - let initrd = InitrdConfig::from_config(boot_config, vmm.vm.guest_memory())?; + let vm = Arc::new(vm); + + let entry_point = load_kernel(&boot_config.kernel_file, vm.guest_memory())?; + let initrd = InitrdConfig::from_config(boot_config, vm.guest_memory())?; #[cfg(feature = "gdb")] let (gdb_tx, gdb_rx) = mpsc::channel(); @@ -254,55 +186,83 @@ pub fn build_microvm_for_boot( #[cfg(feature = "gdb")] let vcpu_fds = vcpus .iter() - .map(|vcpu| vcpu.copy_kvm_vcpu_fd(vmm.vm())) + .map(|vcpu| vcpu.copy_kvm_vcpu_fd(&vm)) .collect::, _>>()?; + if vm_resources.pci_enabled { + device_manager.enable_pci(&vm)?; + } else { + boot_cmdline.insert("pci", "off")?; + } + // The boot timer device needs to be the first device attached in order // to maintain the same MMIO address referenced in the documentation // and tests. 
if vm_resources.boot_timer { - attach_boot_timer_device(&mut vmm, request_ts)?; + device_manager.attach_boot_timer_device(&vm, request_ts)?; } if let Some(balloon) = vm_resources.balloon.get() { - attach_balloon_device(&mut vmm, &mut boot_cmdline, balloon, event_manager)?; + attach_balloon_device( + &mut device_manager, + &vm, + &mut boot_cmdline, + balloon, + event_manager, + )?; } attach_block_devices( - &mut vmm, + &mut device_manager, + &vm, &mut boot_cmdline, vm_resources.block.devices.iter(), event_manager, )?; attach_net_devices( - &mut vmm, + &mut device_manager, + &vm, &mut boot_cmdline, vm_resources.net_builder.iter(), event_manager, )?; if let Some(unix_vsock) = vm_resources.vsock.get() { - attach_unixsock_vsock_device(&mut vmm, &mut boot_cmdline, unix_vsock, event_manager)?; + attach_unixsock_vsock_device( + &mut device_manager, + &vm, + &mut boot_cmdline, + unix_vsock, + event_manager, + )?; } if let Some(entropy) = vm_resources.entropy.get() { - attach_entropy_device(&mut vmm, &mut boot_cmdline, entropy, event_manager)?; + attach_entropy_device( + &mut device_manager, + &vm, + &mut boot_cmdline, + entropy, + event_manager, + )?; } #[cfg(target_arch = "aarch64")] - attach_legacy_devices_aarch64(event_manager, &mut vmm, &mut boot_cmdline)?; + device_manager.attach_legacy_devices_aarch64(&vm, event_manager, &mut boot_cmdline)?; - attach_vmgenid_device(&mut vmm)?; + device_manager.attach_vmgenid_device(vm.guest_memory(), &vm)?; #[cfg(target_arch = "aarch64")] if vcpus[0].kvm_vcpu.supports_pvtime() { - setup_pvtime(&mut vmm, &mut vcpus)?; + setup_pvtime(&mut vm.resource_allocator(), &mut vcpus)?; } else { log::warn!("Vcpus do not support pvtime, steal time will not be reported to guest"); } configure_system_for_boot( - &mut vmm, + &kvm, + &vm, + &mut device_manager, vcpus.as_mut(), &vm_resources.machine_config, &cpu_template, @@ -311,6 +271,18 @@ pub fn build_microvm_for_boot( boot_cmdline, )?; + let vmm = Vmm { + events_observer: Some(std::io::stdin()), + instance_info: instance_info.clone(), + shutdown_exit_code: None, + kvm, + vm, + uffd: None, + vcpus_handles: Vec::new(), + vcpus_exit_evt, + device_manager, + }; + let vmm = Arc::new(Mutex::new(vmm)); #[cfg(feature = "gdb")] @@ -322,7 +294,7 @@ pub fn build_microvm_for_boot( entry_point.entry_addr, gdb_socket_path, ) - .map_err(GdbServer)?; + .map_err(StartMicrovmError::GdbServer)?; } else { debug!("No GDB socket provided not starting gdb server."); } @@ -334,7 +306,7 @@ pub fn build_microvm_for_boot( vcpus, seccomp_filters .get("vcpu") - .ok_or_else(|| MissingSeccompFilters("vcpu".to_string()))? + .ok_or_else(|| StartMicrovmError::MissingSeccompFilters("vcpu".to_string()))? .clone(), ) .map_err(VmmError::VcpuStart)?; @@ -346,7 +318,7 @@ pub fn build_microvm_for_boot( crate::seccomp::apply_filter( seccomp_filters .get("vmm") - .ok_or_else(|| MissingSeccompFilters("vmm".to_string()))?, + .ok_or_else(|| StartMicrovmError::MissingSeccompFilters("vmm".to_string()))?, ) .map_err(VmmError::SeccompFilters)?; @@ -411,10 +383,8 @@ pub enum BuildMicrovmFromSnapshotError { MissingVmmSeccompFilters, /// Failed to apply VMM secccomp filter: {0} SeccompFiltersInternal(#[from] crate::seccomp::InstallationError), - /// Failed to restore ACPI device manager: {0} - ACPIDeviManager(#[from] ACPIDeviceManagerRestoreError), - /// VMGenID update failed: {0} - VMGenIDUpdate(std::io::Error), + /// Failed to restore devices: {0} + RestoreDevices(#[from] DevicePersistError), } /// Builds and starts a microVM based on the provided MicrovmState. 
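The attach order above matters because 4 KiB MMIO slots (and legacy GSIs from GSI_LEGACY_START upwards) are handed out in attach order from the start of the 32-bit device window: attaching the boot timer first keeps it at the documented address and leaves the following slots for virtio devices, which is what the updated virtio_mmio.device=4K@0xc0001000:5 test strings later in this diff check. The sketch below is illustrative only; the simple bump allocation stands in for the real vm-allocator-backed resource allocator.

struct SlotAllocator {
    next_addr: u64,
    next_irq: u32,
}

impl SlotAllocator {
    fn new() -> Self {
        // 32-bit device window starts at MMIO32_MEM_START; 5 == GSI_LEGACY_START.
        SlotAllocator { next_addr: 0xc000_0000, next_irq: 5 }
    }

    /// Returns (MMIO base, GSI) for the next device; the boot timer takes no IRQ.
    fn next(&mut self, needs_irq: bool) -> (u64, Option<u32>) {
        let addr = self.next_addr;
        self.next_addr += 0x1000; // one 4 KiB page per device
        let irq = needs_irq.then(|| {
            let irq = self.next_irq;
            self.next_irq += 1;
            irq
        });
        (addr, irq)
    }
}

fn main() {
    let mut slots = SlotAllocator::new();
    assert_eq!(slots.next(false), (0xc000_0000, None)); // boot timer, attached first
    assert_eq!(slots.next(true), (0xc000_1000, Some(5))); // first virtio device
    assert_eq!(slots.next(true), (0xc000_2000, Some(6)));
    assert_eq!(slots.next(true), (0xc000_3000, Some(7)));
}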
@@ -433,19 +403,19 @@ pub fn build_microvm_from_snapshot( ) -> Result>, BuildMicrovmFromSnapshotError> { // Build Vmm. debug!("event_start: build microvm from snapshot"); - let (mut vmm, mut vcpus) = create_vmm_and_vcpus( - instance_info, - event_manager, - vm_resources.machine_config.vcpu_count, - microvm_state.kvm_state.kvm_cap_modifiers.clone(), - ) - .map_err(StartMicrovmError::Internal)?; - vmm.vm - .register_memory_regions(guest_memory) - .map_err(VmmError::Vm) - .map_err(StartMicrovmError::Internal)?; - vmm.uffd = uffd; + let kvm = Kvm::new(microvm_state.kvm_state.kvm_cap_modifiers.clone()) + .map_err(StartMicrovmError::Kvm)?; + // Set up Kvm Vm and register memory regions. + // Build custom CPU config if a custom template is provided. + let mut vm = Vm::new(&kvm).map_err(StartMicrovmError::Vm)?; + + let (mut vcpus, vcpus_exit_evt) = vm + .create_vcpus(vm_resources.machine_config.vcpu_count) + .map_err(StartMicrovmError::Vm)?; + + vm.register_memory_regions(guest_memory) + .map_err(StartMicrovmError::Vm)?; #[cfg(target_arch = "x86_64")] { @@ -461,16 +431,6 @@ pub fn build_microvm_from_snapshot( } } - // Restore allocator state - #[cfg(target_arch = "aarch64")] - if let Some(pvtime_ipa) = vcpus[0].kvm_vcpu.pvtime_ipa { - allocate_pvtime_region( - &mut vmm, - vcpus.len(), - vm_allocator::AllocPolicy::ExactMatch(pvtime_ipa.0), - )?; - } - // Restore vcpus kvm state. for (vcpu, state) in vcpus.iter_mut().zip(microvm_state.vcpu_states.iter()) { vcpu.kvm_vcpu @@ -483,49 +443,46 @@ pub fn build_microvm_from_snapshot( { let mpidrs = construct_kvm_mpidrs(µvm_state.vcpu_states); // Restore kvm vm state. - vmm.vm.restore_state(&mpidrs, µvm_state.vm_state)?; + vm.restore_state(&mpidrs, µvm_state.vm_state)?; } // Restore kvm vm state. #[cfg(target_arch = "x86_64")] - vmm.vm.restore_state(µvm_state.vm_state)?; + vm.restore_state(µvm_state.vm_state)?; // Restore the boot source config paths. vm_resources.boot_source.config = microvm_state.vm_info.boot_source; + let vm = Arc::new(vm); + // Restore devices states. - let mmio_ctor_args = MMIODevManagerConstructorArgs { - mem: vmm.vm.guest_memory(), - vm: vmm.vm.fd(), + // Restoring VMGenID injects an interrupt in the guest to notify it about the new generation + // ID. As a result, we need to restore DeviceManager after restoring the KVM state, otherwise + // the injected interrupt will be overwritten. + let device_ctor_args = DeviceRestoreArgs { + mem: vm.guest_memory(), + vm: &vm, event_manager, - resource_allocator: &mut vmm.resource_allocator, vm_resources, instance_id: &instance_info.id, - restored_from_file: vmm.uffd.is_none(), + restored_from_file: uffd.is_none(), + vcpus_exit_evt: &vcpus_exit_evt, }; + #[allow(unused_mut)] + let mut device_manager = + DeviceManager::restore(device_ctor_args, µvm_state.device_states)?; - vmm.mmio_device_manager = - MMIODeviceManager::restore(mmio_ctor_args, µvm_state.device_states) - .map_err(MicrovmStateError::RestoreDevices)?; - vmm.emulate_serial_init()?; - - { - let acpi_ctor_args = ACPIDeviceManagerConstructorArgs { - mem: vmm.vm.guest_memory(), - resource_allocator: &mut vmm.resource_allocator, - vm: vmm.vm.fd(), - }; - - vmm.acpi_device_manager = - ACPIDeviceManager::restore(acpi_ctor_args, µvm_state.acpi_dev_state)?; - - // Inject the notification to VMGenID that we have resumed from a snapshot. - // This needs to happen before we resume vCPUs, so that we minimize the time between vCPUs - // resuming and notification being handled by the driver. 
- vmm.acpi_device_manager - .notify_vmgenid() - .map_err(BuildMicrovmFromSnapshotError::VMGenIDUpdate)?; - } + let mut vmm = Vmm { + events_observer: Some(std::io::stdin()), + instance_info: instance_info.clone(), + shutdown_exit_code: None, + kvm, + vm, + uffd, + vcpus_handles: Vec::new(), + vcpus_exit_evt, + device_manager, + }; // Move vcpus to their own threads and start their state machine in the 'Paused' state. vmm.start_vcpus( @@ -551,29 +508,6 @@ pub fn build_microvm_from_snapshot( Ok(vmm) } -/// Sets up the serial device. -pub fn setup_serial_device( - event_manager: &mut EventManager, - input: std::io::Stdin, - out: std::io::Stdout, -) -> Result>, VmmError> { - let interrupt_evt = EventFdTrigger::new(EventFd::new(EFD_NONBLOCK).map_err(VmmError::EventFd)?); - let kick_stdin_read_evt = - EventFdTrigger::new(EventFd::new(EFD_NONBLOCK).map_err(VmmError::EventFd)?); - let serial = Arc::new(Mutex::new(BusDevice::Serial(SerialWrapper { - serial: Serial::with_events( - interrupt_evt, - SerialEventsWrapper { - buffer_ready_event_fd: Some(kick_stdin_read_evt), - }, - SerialOut::Stdout(out), - ), - input: Some(input), - }))); - event_manager.add_subscriber(serial.clone()); - Ok(serial) -} - /// 64 bytes due to alignment requirement in 3.1 of https://www.kernel.org/doc/html/v5.8/virt/kvm/devices/vcpu.html#attribute-kvm-arm-vcpu-pvtime-ipa #[cfg(target_arch = "aarch64")] const STEALTIME_STRUCT_MEM_SIZE: u64 = 64; @@ -581,13 +515,12 @@ const STEALTIME_STRUCT_MEM_SIZE: u64 = 64; /// Helper method to allocate steal time region #[cfg(target_arch = "aarch64")] fn allocate_pvtime_region( - vmm: &mut Vmm, + resource_allocator: &mut ResourceAllocator, vcpu_count: usize, policy: vm_allocator::AllocPolicy, ) -> Result { let size = STEALTIME_STRUCT_MEM_SIZE * vcpu_count as u64; - let addr = vmm - .resource_allocator + let addr = resource_allocator .allocate_system_memory(size, STEALTIME_STRUCT_MEM_SIZE, policy) .map_err(StartMicrovmError::AllocateResources)?; Ok(GuestAddress(addr)) @@ -595,10 +528,16 @@ fn allocate_pvtime_region( /// Sets up pvtime for all vcpus #[cfg(target_arch = "aarch64")] -fn setup_pvtime(vmm: &mut Vmm, vcpus: &mut [Vcpu]) -> Result<(), StartMicrovmError> { +fn setup_pvtime( + resource_allocator: &mut ResourceAllocator, + vcpus: &mut [Vcpu], +) -> Result<(), StartMicrovmError> { // Alloc sys mem for steal time region - let pvtime_mem: GuestAddress = - allocate_pvtime_region(vmm, vcpus.len(), vm_allocator::AllocPolicy::LastMatch)?; + let pvtime_mem: GuestAddress = allocate_pvtime_region( + resource_allocator, + vcpus.len(), + vm_allocator::AllocPolicy::LastMatch, + )?; // Register all vcpus with pvtime device for (i, vcpu) in vcpus.iter_mut().enumerate() { @@ -612,111 +551,26 @@ fn setup_pvtime(vmm: &mut Vmm, vcpus: &mut [Vcpu]) -> Result<(), StartMicrovmErr Ok(()) } -#[cfg(target_arch = "aarch64")] -fn attach_legacy_devices_aarch64( - event_manager: &mut EventManager, - vmm: &mut Vmm, - cmdline: &mut LoaderKernelCmdline, -) -> Result<(), VmmError> { - // Serial device setup. - let cmdline_contains_console = cmdline - .as_cstring() - .map_err(|_| VmmError::Cmdline)? - .into_string() - .map_err(|_| VmmError::Cmdline)? - .contains("console="); - - if cmdline_contains_console { - // Make stdout non-blocking. 
- set_stdout_nonblocking(); - let serial = setup_serial_device(event_manager, std::io::stdin(), std::io::stdout())?; - vmm.mmio_device_manager - .register_mmio_serial(vmm.vm.fd(), &mut vmm.resource_allocator, serial, None) - .map_err(VmmError::RegisterMMIODevice)?; - vmm.mmio_device_manager - .add_mmio_serial_to_cmdline(cmdline) - .map_err(VmmError::RegisterMMIODevice)?; - } - - let rtc = RTCDevice(Rtc::with_events( - &crate::devices::legacy::rtc_pl031::METRICS, - )); - vmm.mmio_device_manager - .register_mmio_rtc(&mut vmm.resource_allocator, rtc, None) - .map_err(VmmError::RegisterMMIODevice) -} - -/// Attaches a VirtioDevice device to the device manager and event manager. -fn attach_virtio_device( - event_manager: &mut EventManager, - vmm: &mut Vmm, - id: String, - device: Arc>, - cmdline: &mut LoaderKernelCmdline, - is_vhost_user: bool, -) -> Result<(), MmioError> { - event_manager.add_subscriber(device.clone()); - - // The device mutex mustn't be locked here otherwise it will deadlock. - let device = MmioTransport::new(vmm.vm.guest_memory().clone(), device, is_vhost_user); - vmm.mmio_device_manager - .register_mmio_virtio_for_boot( - vmm.vm.fd(), - &mut vmm.resource_allocator, - id, - device, - cmdline, - ) - .map(|_| ()) -} - -pub(crate) fn attach_boot_timer_device( - vmm: &mut Vmm, - request_ts: TimestampUs, -) -> Result<(), MmioError> { - let boot_timer = crate::devices::pseudo::BootTimer::new(request_ts); - - vmm.mmio_device_manager - .register_mmio_boot_timer(&mut vmm.resource_allocator, boot_timer)?; - - Ok(()) -} - -fn attach_vmgenid_device(vmm: &mut Vmm) -> Result<(), StartMicrovmError> { - let vmgenid = VmGenId::new(vmm.vm.guest_memory(), &mut vmm.resource_allocator) - .map_err(StartMicrovmError::CreateVMGenID)?; - - vmm.acpi_device_manager - .attach_vmgenid(vmgenid, vmm.vm.fd()) - .map_err(StartMicrovmError::AttachVmgenidDevice)?; - - Ok(()) -} - fn attach_entropy_device( - vmm: &mut Vmm, + device_manager: &mut DeviceManager, + vm: &Arc, cmdline: &mut LoaderKernelCmdline, entropy_device: &Arc>, event_manager: &mut EventManager, -) -> Result<(), MmioError> { +) -> Result<(), AttachDeviceError> { let id = entropy_device .lock() .expect("Poisoned lock") .id() .to_string(); - attach_virtio_device( - event_manager, - vmm, - id, - entropy_device.clone(), - cmdline, - false, - ) + event_manager.add_subscriber(entropy_device.clone()); + device_manager.attach_virtio_device(vm, id, entropy_device.clone(), cmdline, false) } fn attach_block_devices<'a, I: Iterator>> + Debug>( - vmm: &mut Vmm, + device_manager: &mut DeviceManager, + vm: &Arc, cmdline: &mut LoaderKernelCmdline, blocks: I, event_manager: &mut EventManager, @@ -737,66 +591,52 @@ fn attach_block_devices<'a, I: Iterator>> + Debug>( (locked.id().to_string(), locked.is_vhost_user()) }; // The device mutex mustn't be locked here otherwise it will deadlock. 
- attach_virtio_device( - event_manager, - vmm, - id, - block.clone(), - cmdline, - is_vhost_user, - )?; + event_manager.add_subscriber(block.clone()); + device_manager.attach_virtio_device(vm, id, block.clone(), cmdline, is_vhost_user)?; } Ok(()) } fn attach_net_devices<'a, I: Iterator>> + Debug>( - vmm: &mut Vmm, + device_manager: &mut DeviceManager, + vm: &Arc, cmdline: &mut LoaderKernelCmdline, net_devices: I, event_manager: &mut EventManager, ) -> Result<(), StartMicrovmError> { for net_device in net_devices { let id = net_device.lock().expect("Poisoned lock").id().clone(); + event_manager.add_subscriber(net_device.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. - attach_virtio_device(event_manager, vmm, id, net_device.clone(), cmdline, false)?; + device_manager.attach_virtio_device(vm, id, net_device.clone(), cmdline, false)?; } Ok(()) } fn attach_unixsock_vsock_device( - vmm: &mut Vmm, + device_manager: &mut DeviceManager, + vm: &Arc, cmdline: &mut LoaderKernelCmdline, unix_vsock: &Arc>>, event_manager: &mut EventManager, -) -> Result<(), MmioError> { +) -> Result<(), AttachDeviceError> { let id = String::from(unix_vsock.lock().expect("Poisoned lock").id()); + event_manager.add_subscriber(unix_vsock.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. - attach_virtio_device(event_manager, vmm, id, unix_vsock.clone(), cmdline, false) + device_manager.attach_virtio_device(vm, id, unix_vsock.clone(), cmdline, false) } fn attach_balloon_device( - vmm: &mut Vmm, + device_manager: &mut DeviceManager, + vm: &Arc, cmdline: &mut LoaderKernelCmdline, balloon: &Arc>, event_manager: &mut EventManager, -) -> Result<(), MmioError> { +) -> Result<(), AttachDeviceError> { let id = String::from(balloon.lock().expect("Poisoned lock").id()); + event_manager.add_subscriber(balloon.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. - attach_virtio_device(event_manager, vmm, id, balloon.clone(), cmdline, false) -} - -// Adds `O_NONBLOCK` to the stdout flags. -pub(crate) fn set_stdout_nonblocking() { - // SAFETY: Call is safe since parameters are valid. - let flags = unsafe { libc::fcntl(libc::STDOUT_FILENO, libc::F_GETFL, 0) }; - if flags < 0 { - error!("Could not get Firecracker stdout flags."); - } - // SAFETY: Call is safe since parameters are valid. 
- let rc = unsafe { libc::fcntl(libc::STDOUT_FILENO, libc::F_SETFL, flags | libc::O_NONBLOCK) }; - if rc < 0 { - error!("Could not set Firecracker stdout to non-blocking."); - } + device_manager.attach_virtio_device(vm, id, balloon.clone(), cmdline, false) } #[cfg(test)] @@ -806,8 +646,7 @@ pub(crate) mod tests { use vmm_sys_util::tempfile::TempFile; use super::*; - use crate::arch::DeviceType; - use crate::device_manager::resources::ResourceAllocator; + use crate::device_manager::tests::default_device_manager; use crate::devices::virtio::block::CacheType; use crate::devices::virtio::rng::device::ENTROPY_DEV_ID; use crate::devices::virtio::vsock::{TYPE_VSOCK, VSOCK_DEV_ID}; @@ -880,24 +719,6 @@ pub(crate) mod tests { pub(crate) fn default_vmm() -> Vmm { let (kvm, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); - let mmio_device_manager = MMIODeviceManager::new(); - let acpi_device_manager = ACPIDeviceManager::new(); - #[cfg(target_arch = "x86_64")] - let pio_device_manager = PortIODeviceManager::new( - Arc::new(Mutex::new(BusDevice::Serial(SerialWrapper { - serial: Serial::with_events( - EventFdTrigger::new(EventFd::new(EFD_NONBLOCK).unwrap()), - SerialEventsWrapper { - buffer_ready_event_fd: None, - }, - SerialOut::Sink(std::io::sink()), - ), - input: None, - }))), - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - ) - .unwrap(); - let (_, vcpus_exit_evt) = vm.create_vcpus(1).unwrap(); Vmm { @@ -905,15 +726,11 @@ pub(crate) mod tests { instance_info: InstanceInfo::default(), shutdown_exit_code: None, kvm, - vm, + vm: Arc::new(vm), uffd: None, vcpus_handles: Vec::new(), vcpus_exit_evt, - resource_allocator: ResourceAllocator::new().unwrap(), - mmio_device_manager, - #[cfg(target_arch = "x86_64")] - pio_device_manager, - acpi_device_manager, + device_manager: default_device_manager(), } } @@ -954,7 +771,8 @@ pub(crate) mod tests { } attach_block_devices( - vmm, + &mut vmm.device_manager, + &vmm.vm, cmdline, block_dev_configs.devices.iter(), event_manager, @@ -972,7 +790,13 @@ pub(crate) mod tests { let mut net_builder = NetBuilder::new(); net_builder.build(net_config).unwrap(); - let res = attach_net_devices(vmm, cmdline, net_builder.iter(), event_manager); + let res = attach_net_devices( + &mut vmm.device_manager, + &vmm.vm, + cmdline, + net_builder.iter(), + event_manager, + ); res.unwrap(); } @@ -993,7 +817,14 @@ pub(crate) mod tests { Arc::new(Mutex::new(mmds)), ); - attach_net_devices(vmm, cmdline, net_builder.iter(), event_manager).unwrap(); + attach_net_devices( + &mut vmm.device_manager, + &vmm.vm, + cmdline, + net_builder.iter(), + event_manager, + ) + .unwrap(); } pub(crate) fn insert_vsock_device( @@ -1006,11 +837,18 @@ pub(crate) mod tests { let vsock = VsockBuilder::create_unixsock_vsock(vsock_config).unwrap(); let vsock = Arc::new(Mutex::new(vsock)); - attach_unixsock_vsock_device(vmm, cmdline, &vsock, event_manager).unwrap(); + attach_unixsock_vsock_device( + &mut vmm.device_manager, + &vmm.vm, + cmdline, + &vsock, + event_manager, + ) + .unwrap(); assert!( - vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_VSOCK), &vsock_dev_id) + vmm.device_manager + .get_virtio_device(TYPE_VSOCK, &vsock_dev_id) .is_some() ); } @@ -1024,19 +862,28 @@ pub(crate) mod tests { let mut builder = EntropyDeviceBuilder::new(); let entropy = builder.build(entropy_config).unwrap(); - attach_entropy_device(vmm, cmdline, &entropy, event_manager).unwrap(); + attach_entropy_device( + &mut vmm.device_manager, + &vmm.vm, + cmdline, + &entropy, + event_manager, + ) + .unwrap(); assert!( - 
vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_RNG), ENTROPY_DEV_ID) + vmm.device_manager + .get_virtio_device(TYPE_RNG, ENTROPY_DEV_ID) .is_some() ); } #[cfg(target_arch = "x86_64")] pub(crate) fn insert_vmgenid_device(vmm: &mut Vmm) { - attach_vmgenid_device(vmm).unwrap(); - assert!(vmm.acpi_device_manager.vmgenid.is_some()); + vmm.device_manager + .attach_vmgenid_device(vmm.vm.guest_memory(), &vmm.vm) + .unwrap(); + assert!(vmm.device_manager.acpi_devices.vmgenid.is_some()); } pub(crate) fn insert_balloon_device( @@ -1049,11 +896,18 @@ pub(crate) mod tests { builder.set(balloon_config).unwrap(); let balloon = builder.get().unwrap(); - attach_balloon_device(vmm, cmdline, balloon, event_manager).unwrap(); + attach_balloon_device( + &mut vmm.device_manager, + &vmm.vm, + cmdline, + balloon, + event_manager, + ) + .unwrap(); assert!( - vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BALLOON), BALLOON_DEV_ID) + vmm.device_manager + .get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) .is_some() ); } @@ -1103,8 +957,8 @@ pub(crate) mod tests { insert_block_devices(&mut vmm, &mut cmdline, &mut event_manager, block_configs); assert!(cmdline_contains(&cmdline, "root=/dev/vda ro")); assert!( - vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BLOCK), drive_id.as_str()) + vmm.device_manager + .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); } @@ -1124,8 +978,8 @@ pub(crate) mod tests { insert_block_devices(&mut vmm, &mut cmdline, &mut event_manager, block_configs); assert!(cmdline_contains(&cmdline, "root=PARTUUID=0eaa91a0-01 rw")); assert!( - vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BLOCK), drive_id.as_str()) + vmm.device_manager + .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); } @@ -1146,8 +1000,8 @@ pub(crate) mod tests { assert!(!cmdline_contains(&cmdline, "root=PARTUUID=")); assert!(!cmdline_contains(&cmdline, "root=/dev/vda")); assert!( - vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BLOCK), drive_id.as_str()) + vmm.device_manager + .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); } @@ -1183,18 +1037,18 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=PARTUUID=0eaa91a0-01 rw")); assert!( - vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BLOCK), "root") + vmm.device_manager + .get_virtio_device(TYPE_BLOCK, "root") .is_some() ); assert!( - vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BLOCK), "secondary") + vmm.device_manager + .get_virtio_device(TYPE_BLOCK, "secondary") .is_some() ); assert!( - vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BLOCK), "third") + vmm.device_manager + .get_virtio_device(TYPE_BLOCK, "third") .is_some() ); @@ -1202,8 +1056,8 @@ pub(crate) mod tests { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] assert!(cmdline_contains( &cmdline, - "virtio_mmio.device=4K@0xd0000000:5 virtio_mmio.device=4K@0xd0001000:6 \ - virtio_mmio.device=4K@0xd0002000:7" + "virtio_mmio.device=4K@0xc0001000:5 virtio_mmio.device=4K@0xc0002000:6 \ + virtio_mmio.device=4K@0xc0003000:7" )); } @@ -1222,8 +1076,8 @@ pub(crate) mod tests { insert_block_devices(&mut vmm, &mut cmdline, &mut event_manager, block_configs); assert!(cmdline_contains(&cmdline, "root=/dev/vda rw")); assert!( - vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BLOCK), drive_id.as_str()) + vmm.device_manager + .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); } @@ -1243,8 +1097,8 @@ pub(crate) mod tests { 
insert_block_devices(&mut vmm, &mut cmdline, &mut event_manager, block_configs); assert!(cmdline_contains(&cmdline, "root=PARTUUID=0eaa91a0-01 ro")); assert!( - vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BLOCK), drive_id.as_str()) + vmm.device_manager + .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); } @@ -1264,8 +1118,8 @@ pub(crate) mod tests { insert_block_devices(&mut vmm, &mut cmdline, &mut event_manager, block_configs); assert!(cmdline_contains(&cmdline, "root=/dev/vda rw")); assert!( - vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BLOCK), drive_id.as_str()) + vmm.device_manager + .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); } @@ -1276,13 +1130,11 @@ pub(crate) mod tests { let mut vmm = default_vmm(); let request_ts = TimestampUs::default(); - let res = attach_boot_timer_device(&mut vmm, request_ts); + let res = vmm + .device_manager + .attach_boot_timer_device(&vmm.vm, request_ts); res.unwrap(); - assert!( - vmm.mmio_device_manager - .get_device(DeviceType::BootTimer, &DeviceType::BootTimer.to_string()) - .is_some() - ); + assert!(vmm.device_manager.mmio_devices.boot_timer.is_some()); } #[test] @@ -1302,7 +1154,7 @@ pub(crate) mod tests { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] assert!(cmdline_contains( &cmdline, - "virtio_mmio.device=4K@0xd0000000:5" + "virtio_mmio.device=4K@0xc0001000:5" )); } @@ -1319,7 +1171,7 @@ pub(crate) mod tests { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] assert!(cmdline_contains( &cmdline, - "virtio_mmio.device=4K@0xd0000000:5" + "virtio_mmio.device=4K@0xc0001000:5" )); } @@ -1338,7 +1190,7 @@ pub(crate) mod tests { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] assert!(cmdline_contains( &cmdline, - "virtio_mmio.device=4K@0xd0000000:5" + "virtio_mmio.device=4K@0xc0001000:5" )); } } diff --git a/src/vmm/src/device_manager/acpi.rs b/src/vmm/src/device_manager/acpi.rs index 78f1254d2fa..874443fcc5c 100644 --- a/src/vmm/src/device_manager/acpi.rs +++ b/src/vmm/src/device_manager/acpi.rs @@ -2,11 +2,11 @@ // SPDX-License-Identifier: Apache-2.0 use acpi_tables::{Aml, aml}; -use kvm_ioctls::VmFd; +use crate::Vm; use crate::devices::acpi::vmgenid::VmGenId; -#[derive(Debug)] +#[derive(Debug, Default)] pub struct ACPIDeviceManager { /// VMGenID device pub vmgenid: Option, @@ -15,18 +15,14 @@ pub struct ACPIDeviceManager { impl ACPIDeviceManager { /// Create a new ACPIDeviceManager object pub fn new() -> Self { - Self { vmgenid: None } + Default::default() } /// Attach a new VMGenID device to the microVM /// /// This will register the device's interrupt with KVM - pub fn attach_vmgenid( - &mut self, - vmgenid: VmGenId, - vm_fd: &VmFd, - ) -> Result<(), kvm_ioctls::Error> { - vm_fd.register_irqfd(&vmgenid.interrupt_evt, vmgenid.gsi)?; + pub fn attach_vmgenid(&mut self, vmgenid: VmGenId, vm: &Vm) -> Result<(), kvm_ioctls::Error> { + vm.register_irq(&vmgenid.interrupt_evt, vmgenid.gsi)?; self.vmgenid = Some(vmgenid); Ok(()) } @@ -68,7 +64,7 @@ impl Aml for ACPIDeviceManager { // We know that the maximum IRQ number fits in a u8. 
We have up to // 32 IRQs in x86 and up to 128 in // ARM (look into - // `vmm::crate::arch::layout::IRQ_MAX`) + // `vmm::crate::arch::layout::GSI_LEGACY_END`) #[allow(clippy::cast_possible_truncation)] &aml::Equal::new(&aml::Arg(0), &(vmgenid.gsi as u8)), vec![&aml::Notify::new( diff --git a/src/vmm/src/device_manager/legacy.rs b/src/vmm/src/device_manager/legacy.rs index 20b008769a5..d0194e24e62 100644 --- a/src/vmm/src/device_manager/legacy.rs +++ b/src/vmm/src/device_manager/legacy.rs @@ -11,20 +11,19 @@ use std::sync::{Arc, Mutex}; use acpi_tables::aml::AmlError; use acpi_tables::{Aml, aml}; -use kvm_ioctls::VmFd; use libc::EFD_NONBLOCK; use vm_superio::Serial; use vmm_sys_util::eventfd::EventFd; -use crate::devices::bus::BusDevice; +use crate::Vm; use crate::devices::legacy::serial::SerialOut; -use crate::devices::legacy::{EventFdTrigger, SerialDevice, SerialEventsWrapper}; +use crate::devices::legacy::{EventFdTrigger, I8042Device, SerialDevice, SerialEventsWrapper}; /// Errors corresponding to the `PortIODeviceManager`. #[derive(Debug, derive_more::From, thiserror::Error, displaydoc::Display)] pub enum LegacyDeviceError { /// Failed to add legacy device to Bus: {0} - BusError(crate::devices::BusError), + BusError(vm_device::BusError), /// Failed to create EventFd: {0} EventFd(std::io::Error), } @@ -34,11 +33,10 @@ pub enum LegacyDeviceError { /// The `LegacyDeviceManger` should be initialized only by using the constructor. #[derive(Debug)] pub struct PortIODeviceManager { - pub io_bus: crate::devices::Bus, // BusDevice::Serial - pub stdio_serial: Arc>, + pub stdio_serial: Arc>, // BusDevice::I8042Device - pub i8042: Arc>, + pub i8042: Arc>, // Communication event on ports 1 & 3. pub com_evt_1_3: EventFdTrigger, @@ -73,29 +71,24 @@ impl PortIODeviceManager { /// Create a new DeviceManager handling legacy devices (uart, i8042). pub fn new( - serial: Arc>, - i8042_reset_evfd: EventFd, + stdio_serial: Arc>, + i8042: Arc>, ) -> Result { - debug_assert!(matches!(*serial.lock().unwrap(), BusDevice::Serial(_))); - let io_bus = crate::devices::Bus::new(); - let com_evt_1_3 = serial + let com_evt_1_3 = stdio_serial .lock() .expect("Poisoned lock") - .serial_mut() - .unwrap() .serial .interrupt_evt() .try_clone()?; let com_evt_2_4 = EventFdTrigger::new(EventFd::new(EFD_NONBLOCK)?); - let kbd_evt = EventFd::new(libc::EFD_NONBLOCK)?; - - let i8042 = Arc::new(Mutex::new(BusDevice::I8042Device( - crate::devices::legacy::I8042Device::new(i8042_reset_evfd, kbd_evt.try_clone()?), - ))); + let kbd_evt = i8042 + .lock() + .expect("Poisoned lock") + .kbd_interrupt_evt + .try_clone()?; Ok(PortIODeviceManager { - io_bus, - stdio_serial: serial, + stdio_serial, i8042, com_evt_1_3, com_evt_2_4, @@ -104,8 +97,8 @@ impl PortIODeviceManager { } /// Register supported legacy devices. 
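// Editor's sketch (illustrative, not part of this patch): with the reworked
// constructor the caller builds both legacy devices up front and hands the
// shared handles to the manager; registration then needs only a `&Vm` instead
// of a raw `VmFd`. Names and signatures are the ones introduced by this diff,
// mirroring the test at the bottom of this file.
fn example_port_io_setup(vm: &Vm) -> Result<PortIODeviceManager, LegacyDeviceError> {
    // Serial console writing to a sink; a real VMM would wire stdio here.
    let serial = Arc::new(Mutex::new(
        SerialDevice::new(None, SerialOut::Sink(std::io::sink())).unwrap(),
    ));
    // i8042 keyboard controller whose reset line is an eventfd.
    let i8042 = Arc::new(Mutex::new(
        I8042Device::new(EventFd::new(libc::EFD_NONBLOCK).unwrap()).unwrap(),
    ));
    let mut pio = PortIODeviceManager::new(serial, i8042)?;
    // Inserts the serial ports and the i8042 on the VM's port-IO bus and
    // registers their irqfds with KVM.
    pio.register_devices(vm)?;
    Ok(pio)
}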
- pub fn register_devices(&mut self, vm_fd: &VmFd) -> Result<(), LegacyDeviceError> { - let serial_2_4 = Arc::new(Mutex::new(BusDevice::Serial(SerialDevice { + pub fn register_devices(&mut self, vm: &Vm) -> Result<(), LegacyDeviceError> { + let serial_2_4 = Arc::new(Mutex::new(SerialDevice { serial: Serial::with_events( self.com_evt_2_4.try_clone()?.try_clone()?, SerialEventsWrapper { @@ -114,8 +107,8 @@ impl PortIODeviceManager { SerialOut::Sink(std::io::sink()), ), input: None, - }))); - let serial_1_3 = Arc::new(Mutex::new(BusDevice::Serial(SerialDevice { + })); + let serial_1_3 = Arc::new(Mutex::new(SerialDevice { serial: Serial::with_events( self.com_evt_1_3.try_clone()?.try_clone()?, SerialEventsWrapper { @@ -124,45 +117,44 @@ impl PortIODeviceManager { SerialOut::Sink(std::io::sink()), ), input: None, - }))); - self.io_bus.insert( + })); + + let io_bus = &vm.pio_bus; + io_bus.insert( self.stdio_serial.clone(), Self::SERIAL_PORT_ADDRESSES[0], Self::SERIAL_PORT_SIZE, )?; - self.io_bus.insert( + io_bus.insert( serial_2_4.clone(), Self::SERIAL_PORT_ADDRESSES[1], Self::SERIAL_PORT_SIZE, )?; - self.io_bus.insert( + io_bus.insert( serial_1_3, Self::SERIAL_PORT_ADDRESSES[2], Self::SERIAL_PORT_SIZE, )?; - self.io_bus.insert( + io_bus.insert( serial_2_4, Self::SERIAL_PORT_ADDRESSES[3], Self::SERIAL_PORT_SIZE, )?; - self.io_bus.insert( + io_bus.insert( self.i8042.clone(), Self::I8042_KDB_DATA_REGISTER_ADDRESS, Self::I8042_KDB_DATA_REGISTER_SIZE, )?; - vm_fd - .register_irqfd(&self.com_evt_1_3, Self::COM_EVT_1_3_GSI) + vm.register_irq(&self.com_evt_1_3, Self::COM_EVT_1_3_GSI) .map_err(|e| { LegacyDeviceError::EventFd(std::io::Error::from_raw_os_error(e.errno())) })?; - vm_fd - .register_irqfd(&self.com_evt_2_4, Self::COM_EVT_2_4_GSI) + vm.register_irq(&self.com_evt_2_4, Self::COM_EVT_2_4_GSI) .map_err(|e| { LegacyDeviceError::EventFd(std::io::Error::from_raw_os_error(e.errno())) })?; - vm_fd - .register_irqfd(&self.kbd_evt, Self::KBD_EVT_GSI) + vm.register_irq(&self.kbd_evt, Self::KBD_EVT_GSI) .map_err(|e| { LegacyDeviceError::EventFd(std::io::Error::from_raw_os_error(e.errno())) })?; @@ -251,7 +243,7 @@ mod tests { let (_, vm) = setup_vm_with_memory(0x1000); vm.setup_irqchip().unwrap(); let mut ldm = PortIODeviceManager::new( - Arc::new(Mutex::new(BusDevice::Serial(SerialDevice { + Arc::new(Mutex::new(SerialDevice { serial: Serial::with_events( EventFdTrigger::new(EventFd::new(EFD_NONBLOCK).unwrap()), SerialEventsWrapper { @@ -260,10 +252,12 @@ mod tests { SerialOut::Sink(std::io::sink()), ), input: None, - }))), - EventFd::new(libc::EFD_NONBLOCK).unwrap(), + })), + Arc::new(Mutex::new( + I8042Device::new(EventFd::new(libc::EFD_NONBLOCK).unwrap()).unwrap(), + )), ) .unwrap(); - ldm.register_devices(vm.fd()).unwrap(); + ldm.register_devices(&vm).unwrap(); } } diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 9a7dc775295..a87646b11cf 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -11,31 +11,25 @@ use std::sync::{Arc, Mutex}; #[cfg(target_arch = "x86_64")] use acpi_tables::{Aml, aml}; -use kvm_ioctls::{IoEventAddress, VmFd}; +use kvm_ioctls::IoEventAddress; use linux_loader::cmdline as kernel_cmdline; #[cfg(target_arch = "x86_64")] use log::debug; -use log::info; use serde::{Deserialize, Serialize}; use vm_allocator::AllocPolicy; -use super::resources::ResourceAllocator; -use crate::arch::DeviceType; -use crate::arch::DeviceType::Virtio; -use crate::devices::BusDevice; +use crate::Vm; +use 
crate::arch::BOOT_DEVICE_MEM_START; #[cfg(target_arch = "aarch64")] -use crate::devices::legacy::RTCDevice; +use crate::arch::{RTC_MEM_START, SERIAL_MEM_START}; +#[cfg(target_arch = "aarch64")] +use crate::devices::legacy::{RTCDevice, SerialDevice}; use crate::devices::pseudo::BootTimer; -use crate::devices::virtio::balloon::Balloon; -use crate::devices::virtio::block::device::Block; use crate::devices::virtio::device::VirtioDevice; -use crate::devices::virtio::mmio::MmioTransport; -use crate::devices::virtio::net::Net; -use crate::devices::virtio::rng::Entropy; -use crate::devices::virtio::vsock::{TYPE_VSOCK, Vsock, VsockUnixBackend}; -use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; +use crate::devices::virtio::transport::mmio::MmioTransport; #[cfg(target_arch = "x86_64")] use crate::vstate::memory::GuestAddress; +use crate::vstate::resources::ResourceAllocator; /// Errors for MMIO device manager. #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -43,15 +37,17 @@ pub enum MmioError { /// Failed to allocate requested resource: {0} Allocator(#[from] vm_allocator::Error), /// Failed to insert device on the bus: {0} - BusInsert(crate::devices::BusError), + BusInsert(#[from] vm_device::BusError), /// Failed to allocate requested resourc: {0} - Cmdline(linux_loader::cmdline::Error), + Cmdline(#[from] linux_loader::cmdline::Error), /// Failed to find the device on the bus. DeviceNotFound, /// Invalid device type found on the MMIO bus. InvalidDeviceType, /// {0} InternalDeviceError(String), + /// Could not create IRQ for MMIO device: {0} + CreateIrq(#[from] std::io::Error), /// Invalid MMIO IRQ configuration. InvalidIrqConfig, /// Failed to register IO event: {0} @@ -71,14 +67,14 @@ pub enum MmioError { pub const MMIO_LEN: u64 = 0x1000; /// Stores the address range and irq allocated to this device. -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct MMIODeviceInfo { /// Mmio address at which the device is registered. pub addr: u64, /// Mmio addr range length. pub len: u64, - /// Used Irq line for the device. - pub irq: Option, + /// Used GSI (interrupt line) for the device. + pub gsi: Option, } #[cfg(target_arch = "x86_64")] @@ -86,12 +82,12 @@ fn add_virtio_aml( dsdt_data: &mut Vec, addr: u64, len: u64, - irq: u32, + gsi: u32, ) -> Result<(), aml::AmlError> { - let dev_id = irq - crate::arch::GSI_BASE; + let dev_id = gsi - crate::arch::GSI_LEGACY_START; debug!( - "acpi: Building AML for VirtIO device _SB_.V{:03}. memory range: {:#010x}:{} irq: {}", - dev_id, addr, len, irq + "acpi: Building AML for VirtIO device _SB_.V{:03}. memory range: {:#010x}:{} gsi: {}", + dev_id, addr, len, gsi ); aml::Device::new( format!("V{:03}", dev_id).as_str().try_into()?, @@ -107,7 +103,7 @@ fn add_virtio_aml( addr.try_into().unwrap(), len.try_into().unwrap(), ), - &aml::Interrupt::new(true, true, false, false, irq), + &aml::Interrupt::new(true, true, false, false, gsi), ]), )?, ], @@ -115,11 +111,29 @@ fn add_virtio_aml( .append_aml_bytes(dsdt_data) } +#[derive(Debug, Clone)] +/// A descriptor for MMIO devices +pub struct MMIODevice { + /// MMIO resources allocated to the device + pub(crate) resources: MMIODeviceInfo, + /// The actual device + pub(crate) inner: Arc>, +} + /// Manages the complexities of registering a MMIO device. 
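// Editor's sketch (illustrative, not part of this patch): an `MMIODevice` is
// just the allocated resources paired with the shared transport handle. The
// address/GSI below are the values the tests in this diff expect for the first
// allocated slot (0xc000_1000, legacy GSI 5); real callers obtain them from the
// VM's resource allocator instead of hard-coding them.
fn example_mmio_device(transport: MmioTransport) -> MMIODevice {
    MMIODevice {
        resources: MMIODeviceInfo {
            addr: 0xc000_1000,
            len: MMIO_LEN,
            gsi: Some(5),
        },
        inner: Arc::new(Mutex::new(transport)),
    }
}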
-#[derive(Debug)] +#[derive(Debug, Default)] pub struct MMIODeviceManager { - pub(crate) bus: crate::devices::Bus, - pub(crate) id_to_dev_info: HashMap<(DeviceType, String), MMIODeviceInfo>, + /// VirtIO devices using an MMIO transport layer + pub(crate) virtio_devices: HashMap<(u32, String), MMIODevice>, + /// Boot timer device + pub(crate) boot_timer: Option>, + #[cfg(target_arch = "aarch64")] + /// Real-Time clock on Aarch64 platforms + pub(crate) rtc: Option>, + #[cfg(target_arch = "aarch64")] + /// Serial device on Aarch64 platforms + pub(crate) serial: Option>, + #[cfg(target_arch = "x86_64")] // We create the AML byte code for every VirtIO device in the order we build // it, so that we ensure the root block device is appears first in the DSDT. // This is needed, so that the root device appears as `/dev/vda` in the guest @@ -127,19 +141,13 @@ pub struct MMIODeviceManager { // The alternative would be that we iterate the bus to get the data after all // of the devices are build. However, iterating the bus won't give us the // devices in the order they were added. - #[cfg(target_arch = "x86_64")] pub(crate) dsdt_data: Vec, } impl MMIODeviceManager { /// Create a new DeviceManager handling mmio devices (virtio net, block). pub fn new() -> MMIODeviceManager { - MMIODeviceManager { - bus: crate::devices::Bus::new(), - id_to_dev_info: HashMap::new(), - #[cfg(target_arch = "x86_64")] - dsdt_data: vec![], - } + Default::default() } /// Allocates resources for a new device to be added. @@ -148,71 +156,59 @@ impl MMIODeviceManager { resource_allocator: &mut ResourceAllocator, irq_count: u32, ) -> Result { - let irq = match resource_allocator.allocate_gsi(irq_count)?[..] { + let gsi = match resource_allocator.allocate_gsi_legacy(irq_count)?[..] { [] => None, - [irq] => Some(irq), + [gsi] => Some(gsi), _ => return Err(MmioError::InvalidIrqConfig), }; let device_info = MMIODeviceInfo { - addr: resource_allocator.allocate_mmio_memory( + addr: resource_allocator.allocate_32bit_mmio_memory( MMIO_LEN, MMIO_LEN, AllocPolicy::FirstMatch, )?, len: MMIO_LEN, - irq, + gsi, }; Ok(device_info) } - /// Register a device at some MMIO address. - fn register_mmio_device( - &mut self, - identifier: (DeviceType, String), - device_info: MMIODeviceInfo, - device: Arc>, - ) -> Result<(), MmioError> { - self.bus - .insert(device, device_info.addr, device_info.len) - .map_err(MmioError::BusInsert)?; - self.id_to_dev_info.insert(identifier, device_info); - Ok(()) - } - /// Register a virtio-over-MMIO device to be used via MMIO transport at a specific slot. pub fn register_mmio_virtio( &mut self, - vm: &VmFd, + vm: &Vm, device_id: String, - mmio_device: MmioTransport, - device_info: &MMIODeviceInfo, + device: MMIODevice, ) -> Result<(), MmioError> { // Our virtio devices are currently hardcoded to use a single IRQ. // Validate that requirement. 
- let Some(irq) = device_info.irq else { - return Err(MmioError::InvalidIrqConfig); - }; + let gsi = device.resources.gsi.ok_or(MmioError::InvalidIrqConfig)?; let identifier; { + let mmio_device = device.inner.lock().expect("Poisoned lock"); let locked_device = mmio_device.locked_device(); - identifier = (DeviceType::Virtio(locked_device.device_type()), device_id); + identifier = (locked_device.device_type(), device_id); for (i, queue_evt) in locked_device.queue_events().iter().enumerate() { let io_addr = IoEventAddress::Mmio( - device_info.addr + u64::from(crate::devices::virtio::NOTIFY_REG_OFFSET), + device.resources.addr + u64::from(crate::devices::virtio::NOTIFY_REG_OFFSET), ); - vm.register_ioevent(queue_evt, &io_addr, u32::try_from(i).unwrap()) + vm.fd() + .register_ioevent(queue_evt, &io_addr, u32::try_from(i).unwrap()) .map_err(MmioError::RegisterIoEvent)?; } - vm.register_irqfd(&locked_device.interrupt_trigger().irq_evt, irq) + vm.register_irq(&mmio_device.interrupt.irq_evt, gsi) .map_err(MmioError::RegisterIrqFd)?; } - self.register_mmio_device( - identifier, - device_info.clone(), - Arc::new(Mutex::new(BusDevice::MmioTransport(mmio_device))), - ) + vm.common.mmio_bus.insert( + device.inner.clone(), + device.resources.addr, + device.resources.len, + )?; + self.virtio_devices.insert(identifier, device); + + Ok(()) } /// Append a registered virtio-over-MMIO device to the kernel cmdline. @@ -222,7 +218,7 @@ impl MMIODeviceManager { device_info: &MMIODeviceInfo, ) -> Result<(), MmioError> { // as per doc, [virtio_mmio.]device=@: needs to be appended - // to kernel command line for virtio mmio devices to get recongnized + // to kernel command line for virtio mmio devices to get recognized // the size parameter has to be transformed to KiB, so dividing hexadecimal value in // bytes to 1024; further, the '{}' formatting rust construct will automatically // transform it to decimal @@ -230,7 +226,7 @@ impl MMIODeviceManager { .add_virtio_mmio_device( device_info.len, GuestAddress(device_info.addr), - device_info.irq.unwrap(), + device_info.gsi.unwrap(), None, ) .map_err(MmioError::Cmdline) @@ -240,27 +236,30 @@ impl MMIODeviceManager { /// to the boot cmdline. pub fn register_mmio_virtio_for_boot( &mut self, - vm: &VmFd, - resource_allocator: &mut ResourceAllocator, + vm: &Vm, device_id: String, mmio_device: MmioTransport, _cmdline: &mut kernel_cmdline::Cmdline, - ) -> Result { - let device_info = self.allocate_mmio_resources(resource_allocator, 1)?; - self.register_mmio_virtio(vm, device_id, mmio_device, &device_info)?; + ) -> Result<(), MmioError> { + let device = MMIODevice { + resources: self.allocate_mmio_resources(&mut vm.resource_allocator(), 1)?, + inner: Arc::new(Mutex::new(mmio_device)), + }; + #[cfg(target_arch = "x86_64")] { - Self::add_virtio_device_to_cmdline(_cmdline, &device_info)?; + Self::add_virtio_device_to_cmdline(_cmdline, &device.resources)?; add_virtio_aml( &mut self.dsdt_data, - device_info.addr, - device_info.len, + device.resources.addr, + device.resources.len, // We are sure that `irqs` has at least one element; allocate_mmio_resources makes // sure of it. - device_info.irq.unwrap(), + device.resources.gsi.unwrap(), )?; } - Ok(device_info) + self.register_mmio_virtio(vm, device_id, device)?; + Ok(()) } #[cfg(target_arch = "aarch64")] @@ -268,9 +267,8 @@ impl MMIODeviceManager { /// otherwise allocate a new MMIO resources for it. 
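// Editor's sketch (illustrative, not part of this patch): boot-time virtio
// registration now takes the `Vm` handle directly and draws the 4 KiB window
// and legacy GSI from `vm.resource_allocator()`, which is what produces the
// `virtio_mmio.device=4K@0xc0001000:5` cmdline entry asserted by the builder
// tests earlier in this diff.
fn example_register_virtio_for_boot(
    mgr: &mut MMIODeviceManager,
    vm: &Vm,
    transport: MmioTransport,
    cmdline: &mut kernel_cmdline::Cmdline,
) -> Result<(), MmioError> {
    // Allocates resources, registers ioeventfds and the irqfd with KVM, inserts
    // the transport on the MMIO bus and (on x86_64) appends cmdline/AML data.
    mgr.register_mmio_virtio_for_boot(vm, "root".to_string(), transport, cmdline)
}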
pub fn register_mmio_serial( &mut self, - vm: &VmFd, - resource_allocator: &mut ResourceAllocator, - serial: Arc>, + vm: &Vm, + serial: Arc>, device_info_opt: Option, ) -> Result<(), MmioError> { // Create a new MMIODeviceInfo object on boot path or unwrap the @@ -278,39 +276,49 @@ impl MMIODeviceManager { let device_info = if let Some(device_info) = device_info_opt { device_info } else { - self.allocate_mmio_resources(resource_allocator, 1)? + let gsi = vm.resource_allocator().allocate_gsi_legacy(1)?; + MMIODeviceInfo { + addr: SERIAL_MEM_START, + len: MMIO_LEN, + gsi: Some(gsi[0]), + } }; - vm.register_irqfd( - serial - .lock() - .expect("Poisoned lock") - .serial_ref() - .unwrap() - .serial - .interrupt_evt(), - device_info.irq.unwrap(), + vm.register_irq( + serial.lock().expect("Poisoned lock").serial.interrupt_evt(), + device_info.gsi.unwrap(), ) .map_err(MmioError::RegisterIrqFd)?; - let identifier = (DeviceType::Serial, DeviceType::Serial.to_string()); - // Register the newly created Serial object. - self.register_mmio_device(identifier, device_info, serial) + let device = MMIODevice { + resources: device_info, + inner: serial, + }; + + vm.common.mmio_bus.insert( + device.inner.clone(), + device.resources.addr, + device.resources.len, + )?; + + self.serial = Some(device); + Ok(()) } #[cfg(target_arch = "aarch64")] /// Append the registered early console to the kernel cmdline. + /// + /// This assumes that the device has been registered with the device manager. pub fn add_mmio_serial_to_cmdline( &self, cmdline: &mut kernel_cmdline::Cmdline, ) -> Result<(), MmioError> { - let device_info = self - .id_to_dev_info - .get(&(DeviceType::Serial, DeviceType::Serial.to_string())) - .ok_or(MmioError::DeviceNotFound)?; - cmdline - .insert("earlycon", &format!("uart,mmio,0x{:08x}", device_info.addr)) - .map_err(MmioError::Cmdline) + let device = self.serial.as_ref().unwrap(); + cmdline.insert( + "earlycon", + &format!("uart,mmio,0x{:08x}", device.resources.addr), + )?; + Ok(()) } #[cfg(target_arch = "aarch64")] @@ -318,8 +326,8 @@ impl MMIODeviceManager { /// given as parameter, otherwise allocate a new MMIO resources for it. pub fn register_mmio_rtc( &mut self, - resource_allocator: &mut ResourceAllocator, - rtc: RTCDevice, + vm: &Vm, + rtc: Arc>, device_info_opt: Option, ) -> Result<(), MmioError> { // Create a new MMIODeviceInfo object on boot path or unwrap the @@ -327,91 +335,74 @@ impl MMIODeviceManager { let device_info = if let Some(device_info) = device_info_opt { device_info } else { - self.allocate_mmio_resources(resource_allocator, 1)? + let gsi = vm.resource_allocator().allocate_gsi_legacy(1)?; + MMIODeviceInfo { + addr: RTC_MEM_START, + len: MMIO_LEN, + gsi: Some(gsi[0]), + } }; - // Create a new identifier for the RTC device. - let identifier = (DeviceType::Rtc, DeviceType::Rtc.to_string()); - // Attach the newly created RTC device. - self.register_mmio_device( - identifier, - device_info, - Arc::new(Mutex::new(BusDevice::RTCDevice(rtc))), - ) + let device = MMIODevice { + resources: device_info, + inner: rtc, + }; + + vm.common.mmio_bus.insert( + device.inner.clone(), + device.resources.addr, + device.resources.len, + )?; + self.rtc = Some(device); + Ok(()) } /// Register a boot timer device. pub fn register_mmio_boot_timer( &mut self, - resource_allocator: &mut ResourceAllocator, - device: BootTimer, + mmio_bus: &vm_device::Bus, + boot_timer: Arc>, ) -> Result<(), MmioError> { // Attach a new boot timer device. 
- let device_info = self.allocate_mmio_resources(resource_allocator, 0)?; + let device_info = MMIODeviceInfo { + addr: BOOT_DEVICE_MEM_START, + len: MMIO_LEN, + gsi: None, + }; - let identifier = (DeviceType::BootTimer, DeviceType::BootTimer.to_string()); - self.register_mmio_device( - identifier, - device_info, - Arc::new(Mutex::new(BusDevice::BootTimer(device))), - ) - } + let device = MMIODevice { + resources: device_info, + inner: boot_timer, + }; + + mmio_bus.insert( + device.inner.clone(), + device.resources.addr, + device.resources.len, + )?; + self.boot_timer = Some(device); - /// Gets the information of the devices registered up to some point in time. - pub fn get_device_info(&self) -> &HashMap<(DeviceType, String), MMIODeviceInfo> { - &self.id_to_dev_info + Ok(()) } /// Gets the specified device. - pub fn get_device( + pub fn get_virtio_device( &self, - device_type: DeviceType, + virtio_type: u32, device_id: &str, - ) -> Option<&Mutex> { - if let Some(device_info) = self - .id_to_dev_info - .get(&(device_type, device_id.to_string())) - { - if let Some((_, device)) = self.bus.get_device(device_info.addr) { - return Some(device); - } - } - None - } - - /// Run fn for each registered device. - pub fn for_each_device(&self, mut f: F) -> Result<(), E> - where - F: FnMut(&DeviceType, &String, &MMIODeviceInfo, &Mutex) -> Result<(), E>, - { - for ((device_type, device_id), device_info) in self.get_device_info().iter() { - let bus_device = self - .get_device(*device_type, device_id) - // Safe to unwrap() because we know the device exists. - .unwrap(); - f(device_type, device_id, device_info, bus_device)?; - } - Ok(()) + ) -> Option<&MMIODevice> { + self.virtio_devices + .get(&(virtio_type, device_id.to_string())) } /// Run fn for each registered virtio device. pub fn for_each_virtio_device(&self, mut f: F) -> Result<(), E> where - F: FnMut(u32, &String, &MMIODeviceInfo, Arc>) -> Result<(), E>, + F: FnMut(&u32, &String, &MMIODevice) -> Result<(), E>, { - self.for_each_device(|device_type, device_id, device_info, bus_device| { - if let Virtio(virtio_type) = device_type { - let virtio_device = bus_device - .lock() - .expect("Poisoned lock") - .mmio_transport_ref() - .expect("Unexpected device type") - .device(); - f(*virtio_type, device_id, device_info, virtio_device)?; - } - Ok(()) - })?; - + for ((virtio_type, device_id), mmio_device) in &self.virtio_devices { + f(virtio_type, device_id, mmio_device)?; + } Ok(()) } @@ -426,13 +417,8 @@ impl MMIODeviceManager { T: VirtioDevice + 'static + Debug, F: FnOnce(&mut T) -> Result<(), String>, { - if let Some(busdev) = self.get_device(DeviceType::Virtio(virtio_type), id) { - let virtio_device = busdev - .lock() - .expect("Poisoned lock") - .mmio_transport_ref() - .expect("Unexpected device type") - .device(); + if let Some(device) = self.get_virtio_device(virtio_type, id) { + let virtio_device = device.inner.lock().expect("Poisoned lock").device(); let mut dev = virtio_device.lock().expect("Poisoned lock"); f(dev .as_mut_any() @@ -445,136 +431,83 @@ impl MMIODeviceManager { Ok(()) } - /// Artificially kick devices as if they had external events. - pub fn kick_devices(&self) { - info!("Artificially kick devices."); - // We only kick virtio devices for now. 
- let _: Result<(), MmioError> = - self.for_each_virtio_device(|virtio_type, id, _info, dev| { - let mut virtio = dev.lock().expect("Poisoned lock"); - match virtio_type { - TYPE_BALLOON => { - let balloon = virtio.as_mut_any().downcast_mut::().unwrap(); - // If device is activated, kick the balloon queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in snapshot. - // Stats queue doesn't need kicking as it is notified via a `timer_fd`. - if balloon.is_activated() { - info!("kick balloon {}.", id); - balloon.process_virtio_queues().unwrap(); - } - } - TYPE_BLOCK => { - // We only care about kicking virtio block. - // If we need to kick vhost-user-block we can do nothing. - if let Some(block) = virtio.as_mut_any().downcast_mut::() { - // If device is activated, kick the block queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in - // snapshot. No need to kick Ratelimiters - // because they are restored 'unblocked' so - // any inflight `timer_fd` events can be safely discarded. - if block.is_activated() { - info!("kick block {}.", id); - block.process_virtio_queues().unwrap() - } - } - } - TYPE_NET => { - let net = virtio.as_mut_any().downcast_mut::().unwrap(); - // If device is activated, kick the net queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in snapshot. - // No need to kick Ratelimiters because they are restored 'unblocked' so - // any inflight `timer_fd` events can be safely discarded. - if net.is_activated() { - info!("kick net {}.", id); - net.process_virtio_queues().unwrap(); - } - } - TYPE_VSOCK => { - // Vsock has complicated protocol that isn't resilient to any packet loss, - // so for Vsock we don't support connection persistence through snapshot. - // Any in-flight packets or events are simply lost. - // Vsock is restored 'empty'. - // The only reason we still `kick` it is to make guest process - // `TRANSPORT_RESET_EVENT` event we sent during snapshot creation. 
- let vsock = virtio - .as_mut_any() - .downcast_mut::>() - .unwrap(); - if vsock.is_activated() { - info!("kick vsock {id}."); - vsock.signal_used_queue().unwrap(); - } - } - TYPE_RNG => { - let entropy = virtio.as_mut_any().downcast_mut::().unwrap(); - if entropy.is_activated() { - info!("kick entropy {id}."); - entropy.process_virtio_queues().unwrap(); - } - } - _ => (), - } - Ok(()) - }); + #[cfg(target_arch = "aarch64")] + pub fn virtio_device_info(&self) -> Vec<&MMIODeviceInfo> { + let mut device_info = Vec::new(); + for (_, dev) in self.virtio_devices.iter() { + device_info.push(&dev.resources); + } + device_info + } + + #[cfg(target_arch = "aarch64")] + pub fn rtc_device_info(&self) -> Option<&MMIODeviceInfo> { + self.rtc.as_ref().map(|device| &device.resources) + } + + #[cfg(target_arch = "aarch64")] + pub fn serial_device_info(&self) -> Option<&MMIODeviceInfo> { + self.serial.as_ref().map(|device| &device.resources) } } #[cfg(test)] -mod tests { +pub(crate) mod tests { + use std::ops::Deref; use std::sync::Arc; use vmm_sys_util::eventfd::EventFd; use super::*; - use crate::Vm; use crate::devices::virtio::ActivateError; - use crate::devices::virtio::device::{IrqTrigger, VirtioDevice}; + use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::Queue; + use crate::devices::virtio::transport::VirtioInterrupt; + use crate::devices::virtio::transport::mmio::IrqTrigger; use crate::test_utils::multi_region_mem_raw; use crate::vstate::kvm::Kvm; use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; + use crate::{Vm, arch}; const QUEUE_SIZES: &[u16] = &[64]; impl MMIODeviceManager { - fn register_virtio_test_device( + pub(crate) fn register_virtio_test_device( &mut self, - vm: &VmFd, + vm: &Vm, guest_mem: GuestMemoryMmap, - resource_allocator: &mut ResourceAllocator, device: Arc>, cmdline: &mut kernel_cmdline::Cmdline, dev_id: &str, ) -> Result { - let mmio_device = MmioTransport::new(guest_mem, device, false); - let device_info = self.register_mmio_virtio_for_boot( - vm, - resource_allocator, - dev_id.to_string(), - mmio_device, - cmdline, - )?; - Ok(device_info.addr) + let interrupt = Arc::new(IrqTrigger::new()); + let mmio_device = MmioTransport::new(guest_mem, interrupt, device.clone(), false); + self.register_mmio_virtio_for_boot(vm, dev_id.to_string(), mmio_device, cmdline)?; + Ok(self + .get_virtio_device(device.lock().unwrap().device_type(), dev_id) + .unwrap() + .resources + .addr) } #[cfg(target_arch = "x86_64")] /// Gets the number of interrupts used by the devices registered. 
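// Editor's sketch (illustrative, not part of this patch): the aarch64-only
// accessors added above (`virtio_device_info`, `rtc_device_info`,
// `serial_device_info`) replace the old `(DeviceType, String)` info map. A
// consumer -- e.g. the device-tree builder, which is an assumption and not part
// of this hunk -- could gather the resource descriptors like this:
//
//     let mut info: Vec<MMIODeviceInfo> =
//         mgr.virtio_device_info().into_iter().copied().collect();
//     info.extend(mgr.rtc_device_info().copied());
//     info.extend(mgr.serial_device_info().copied());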
pub fn used_irqs_count(&self) -> usize { - self.get_device_info() + self.virtio_devices .iter() - .filter(|(_, device_info)| device_info.irq.is_some()) + .filter(|(_, mmio_dev)| mmio_dev.resources.gsi.is_some()) .count() } } #[allow(dead_code)] #[derive(Debug)] - struct DummyDevice { + pub(crate) struct DummyDevice { dummy: u32, queues: Vec, queue_evts: [EventFd; 1], - interrupt_trigger: IrqTrigger, + interrupt_trigger: Option>, } impl DummyDevice { @@ -583,7 +516,7 @@ mod tests { dummy: 0, queues: QUEUE_SIZES.iter().map(|&s| Queue::new(s)).collect(), queue_evts: [EventFd::new(libc::EFD_NONBLOCK).expect("cannot create eventFD")], - interrupt_trigger: IrqTrigger::new().expect("cannot create eventFD"), + interrupt_trigger: None, } } } @@ -615,8 +548,8 @@ mod tests { &self.queue_evts } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self.interrupt_trigger + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { + self.interrupt_trigger.as_ref().unwrap().deref() } fn ack_features_by_page(&mut self, page: u32, value: u32) { @@ -634,7 +567,11 @@ mod tests { let _ = data; } - fn activate(&mut self, _: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + _: GuestMemoryMmap, + _: Arc, + ) -> Result<(), ActivateError> { Ok(()) } @@ -653,7 +590,6 @@ mod tests { let mut vm = Vm::new(&kvm).unwrap(); vm.register_memory_regions(guest_mem).unwrap(); let mut device_manager = MMIODeviceManager::new(); - let mut resource_allocator = ResourceAllocator::new().unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); let dummy = Arc::new(Mutex::new(DummyDevice::new())); @@ -664,14 +600,30 @@ mod tests { device_manager .register_virtio_test_device( - vm.fd(), + &vm, vm.guest_memory().clone(), - &mut resource_allocator, dummy, &mut cmdline, "dummy", ) .unwrap(); + + assert!(device_manager.get_virtio_device(0, "foo").is_none()); + let dev = device_manager.get_virtio_device(0, "dummy").unwrap(); + assert_eq!(dev.resources.addr, arch::MEM_32BIT_DEVICES_START); + assert_eq!(dev.resources.len, MMIO_LEN); + assert_eq!(dev.resources.gsi, Some(arch::GSI_LEGACY_START)); + + device_manager + .for_each_virtio_device(|virtio_type, device_id, mmio_device| { + assert_eq!(*virtio_type, 0); + assert_eq!(device_id, "dummy"); + assert_eq!(mmio_device.resources.addr, arch::MEM_32BIT_DEVICES_START); + assert_eq!(mmio_device.resources.len, MMIO_LEN); + assert_eq!(mmio_device.resources.gsi, Some(arch::GSI_LEGACY_START)); + Ok::<(), ()>(()) + }) + .unwrap(); } #[test] @@ -684,7 +636,6 @@ mod tests { let mut vm = Vm::new(&kvm).unwrap(); vm.register_memory_regions(guest_mem).unwrap(); let mut device_manager = MMIODeviceManager::new(); - let mut resource_allocator = ResourceAllocator::new().unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); #[cfg(target_arch = "x86_64")] @@ -692,12 +643,11 @@ mod tests { #[cfg(target_arch = "aarch64")] vm.setup_irqchip(1).unwrap(); - for _i in crate::arch::GSI_BASE..=crate::arch::GSI_MAX { + for _i in crate::arch::GSI_LEGACY_START..=crate::arch::GSI_LEGACY_END { device_manager .register_virtio_test_device( - vm.fd(), + &vm, vm.guest_memory().clone(), - &mut resource_allocator, Arc::new(Mutex::new(DummyDevice::new())), &mut cmdline, "dummy1", @@ -709,9 +659,8 @@ mod tests { "{}", device_manager .register_virtio_test_device( - vm.fd(), + &vm, vm.guest_memory().clone(), - &mut resource_allocator, Arc::new(Mutex::new(DummyDevice::new())), &mut cmdline, "dummy2" @@ -746,68 +695,49 @@ mod tests { vm.setup_irqchip(1).unwrap(); let mut device_manager 
= MMIODeviceManager::new(); - let mut resource_allocator = ResourceAllocator::new().unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); let dummy = Arc::new(Mutex::new(DummyDevice::new())); let type_id = dummy.lock().unwrap().device_type(); let id = String::from("foo"); let addr = device_manager - .register_virtio_test_device( - vm.fd(), - vm.guest_memory().clone(), - &mut resource_allocator, - dummy, - &mut cmdline, - &id, - ) + .register_virtio_test_device(&vm, vm.guest_memory().clone(), dummy, &mut cmdline, &id) .unwrap(); - assert!( - device_manager - .get_device(DeviceType::Virtio(type_id), &id) - .is_some() - ); + assert!(device_manager.get_virtio_device(type_id, &id).is_some()); assert_eq!( addr, - device_manager.id_to_dev_info[&(DeviceType::Virtio(type_id), id.clone())].addr + device_manager.virtio_devices[&(type_id, id.clone())] + .resources + .addr ); assert_eq!( - crate::arch::GSI_BASE, - device_manager.id_to_dev_info[&(DeviceType::Virtio(type_id), id)] - .irq + crate::arch::GSI_LEGACY_START, + device_manager.virtio_devices[&(type_id, id)] + .resources + .gsi .unwrap() ); let id = "bar"; - assert!( - device_manager - .get_device(DeviceType::Virtio(type_id), id) - .is_none() - ); + assert!(device_manager.get_virtio_device(type_id, id).is_none()); let dummy2 = Arc::new(Mutex::new(DummyDevice::new())); let id2 = String::from("foo2"); device_manager - .register_virtio_test_device( - vm.fd(), - vm.guest_memory().clone(), - &mut resource_allocator, - dummy2, - &mut cmdline, - &id2, - ) + .register_virtio_test_device(&vm, vm.guest_memory().clone(), dummy2, &mut cmdline, &id2) .unwrap(); let mut count = 0; - let _: Result<(), MmioError> = device_manager.for_each_device(|devtype, devid, _, _| { - assert_eq!(*devtype, DeviceType::Virtio(type_id)); - match devid.as_str() { - "foo" => count += 1, - "foo2" => count += 2, - _ => unreachable!(), - }; - Ok(()) - }); + let _: Result<(), MmioError> = + device_manager.for_each_virtio_device(|devtype, devid, _| { + assert_eq!(*devtype, type_id); + match devid.as_str() { + "foo" => count += 1, + "foo2" => count += 2, + _ => unreachable!(), + }; + Ok(()) + }); assert_eq!(count, 3); #[cfg(target_arch = "x86_64")] assert_eq!(device_manager.used_irqs_count(), 2); @@ -816,29 +746,29 @@ mod tests { #[test] fn test_no_irq_allocation() { let mut device_manager = MMIODeviceManager::new(); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); let device_info = device_manager .allocate_mmio_resources(&mut resource_allocator, 0) .unwrap(); - assert!(device_info.irq.is_none()); + assert!(device_info.gsi.is_none()); } #[test] fn test_irq_allocation() { let mut device_manager = MMIODeviceManager::new(); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); let device_info = device_manager .allocate_mmio_resources(&mut resource_allocator, 1) .unwrap(); - assert_eq!(device_info.irq.unwrap(), crate::arch::GSI_BASE); + assert_eq!(device_info.gsi.unwrap(), crate::arch::GSI_LEGACY_START); } #[test] fn test_allocation_failure() { let mut device_manager = MMIODeviceManager::new(); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); assert_eq!( format!( "{}", diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index bc16604b645..c7f6acabfe1 100644 --- a/src/vmm/src/device_manager/mod.rs +++ 
b/src/vmm/src/device_manager/mod.rs @@ -5,13 +5,612 @@ // Use of this source code is governed by a BSD-style license that can be // found in the THIRD-PARTY file. +use std::convert::Infallible; +use std::fmt::Debug; +use std::sync::{Arc, Mutex}; + +use acpi::ACPIDeviceManager; +use event_manager::{MutEventSubscriber, SubscriberOps}; +#[cfg(target_arch = "x86_64")] +use legacy::{LegacyDeviceError, PortIODeviceManager}; +use linux_loader::loader::Cmdline; +use log::{error, info}; +use mmio::{MMIODeviceManager, MmioError}; +use pci_mngr::{PciDevices, PciDevicesConstructorArgs, PciManagerError}; +use persist::{ACPIDeviceManagerConstructorArgs, MMIODevManagerConstructorArgs}; +use serde::{Deserialize, Serialize}; +use utils::time::TimestampUs; +use vmm_sys_util::eventfd::EventFd; + +use crate::devices::acpi::vmgenid::{VmGenId, VmGenIdError}; +#[cfg(target_arch = "x86_64")] +use crate::devices::legacy::I8042Device; +#[cfg(target_arch = "aarch64")] +use crate::devices::legacy::RTCDevice; +use crate::devices::legacy::serial::SerialOut; +use crate::devices::legacy::{IER_RDA_BIT, IER_RDA_OFFSET, SerialDevice}; +use crate::devices::pseudo::BootTimer; +use crate::devices::virtio::device::VirtioDevice; +use crate::devices::virtio::transport::mmio::{IrqTrigger, MmioTransport}; +use crate::resources::VmResources; +use crate::snapshot::Persist; +use crate::vstate::memory::GuestMemoryMmap; +use crate::{EmulateSerialInitError, EventManager, Vm}; + /// ACPI device manager. pub mod acpi; /// Legacy Device Manager. pub mod legacy; /// Memory Mapped I/O Manager. pub mod mmio; +/// PCIe device manager +pub mod pci_mngr; /// Device managers (de)serialization support. pub mod persist; -/// Resource manager for devices. -pub mod resources; + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +/// Error while creating a new [`DeviceManager`] +pub enum DeviceManagerCreateError { + /// Error with EventFd: {0} + EventFd(#[from] std::io::Error), + #[cfg(target_arch = "x86_64")] + /// Legacy device manager error: {0} + PortIOError(#[from] LegacyDeviceError), + /// Resource allocator error: {0} + ResourceAllocator(#[from] vm_allocator::Error), +} + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +/// Error while attaching a VirtIO device +pub enum AttachDeviceError { + /// MMIO transport error: {0} + MmioTransport(#[from] MmioError), + /// Error inserting device in bus: {0} + Bus(#[from] vm_device::BusError), + /// Error creating VMGenID device: {0} + CreateVmGenID(#[from] VmGenIdError), + /// Error while registering VMGenID with KVM: {0} + AttachVmGenID(#[from] kvm_ioctls::Error), + #[cfg(target_arch = "aarch64")] + /// Cmdline error + Cmdline, + #[cfg(target_arch = "aarch64")] + /// Error creating serial device: {0} + CreateSerial(#[from] std::io::Error), + /// Error attach PCI device: {0} + PciTransport(#[from] PciManagerError), +} + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +/// Error while searching for a VirtIO device +pub enum FindDeviceError { + /// Device type is invalid + InvalidDeviceType, + /// Device not found + DeviceNotFound, + /// Internal Device error: {0} + InternalDeviceError(String), +} + +#[derive(Debug)] +/// A manager of all peripheral devices of Firecracker +pub struct DeviceManager { + /// MMIO devices + pub mmio_devices: MMIODeviceManager, + #[cfg(target_arch = "x86_64")] + /// Legacy devices + pub legacy_devices: PortIODeviceManager, + /// ACPI devices + pub acpi_devices: ACPIDeviceManager, + /// PCIe devices + pub pci_devices: PciDevices, +} + +impl DeviceManager { 
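// Editor's sketch (illustrative, not part of this patch): `DeviceManager` now
// fronts the MMIO, port-IO (x86_64 only), ACPI and PCI sub-managers behind a
// single type, so the VMM builder constructs it once and routes every attach
// call through it. A minimal construction sketch, assuming the caller already
// owns the event manager, the vCPU exit eventfd and the `Vm`:
//
//     let mut device_manager = DeviceManager::new(&mut event_manager, &vcpus_exit_evt, &vm)?;
//     device_manager.attach_boot_timer_device(&vm, TimestampUs::default())?;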
+ // Adds `O_NONBLOCK` to the stdout flags. + fn set_stdout_nonblocking() { + // SAFETY: Call is safe since parameters are valid. + let flags = unsafe { libc::fcntl(libc::STDOUT_FILENO, libc::F_GETFL, 0) }; + if flags < 0 { + error!("Could not get Firecracker stdout flags."); + } + // SAFETY: Call is safe since parameters are valid. + let rc = + unsafe { libc::fcntl(libc::STDOUT_FILENO, libc::F_SETFL, flags | libc::O_NONBLOCK) }; + if rc < 0 { + error!("Could not set Firecracker stdout to non-blocking."); + } + } + + /// Sets up the serial device. + fn setup_serial_device( + event_manager: &mut EventManager, + ) -> Result>, std::io::Error> { + let serial = Arc::new(Mutex::new(SerialDevice::new( + Some(std::io::stdin()), + SerialOut::Stdout(std::io::stdout()), + )?)); + event_manager.add_subscriber(serial.clone()); + Ok(serial) + } + + #[cfg(target_arch = "x86_64")] + fn create_legacy_devices( + event_manager: &mut EventManager, + vcpus_exit_evt: &EventFd, + vm: &Vm, + ) -> Result { + Self::set_stdout_nonblocking(); + + // Create serial device + let serial = Self::setup_serial_device(event_manager)?; + let reset_evt = vcpus_exit_evt + .try_clone() + .map_err(DeviceManagerCreateError::EventFd)?; + // Create keyboard emulator for reset event + let i8042 = Arc::new(Mutex::new(I8042Device::new(reset_evt)?)); + + // create pio dev manager with legacy devices + let mut legacy_devices = PortIODeviceManager::new(serial, i8042)?; + legacy_devices.register_devices(vm)?; + Ok(legacy_devices) + } + + #[cfg_attr(target_arch = "aarch64", allow(unused))] + pub fn new( + event_manager: &mut EventManager, + vcpus_exit_evt: &EventFd, + vm: &Vm, + ) -> Result { + #[cfg(target_arch = "x86_64")] + let legacy_devices = Self::create_legacy_devices(event_manager, vcpus_exit_evt, vm)?; + + Ok(DeviceManager { + mmio_devices: MMIODeviceManager::new(), + #[cfg(target_arch = "x86_64")] + legacy_devices, + acpi_devices: ACPIDeviceManager::new(), + pci_devices: PciDevices::new(), + }) + } + + /// Attaches an MMIO VirtioDevice device to the device manager and event manager. + pub(crate) fn attach_mmio_virtio_device< + T: 'static + VirtioDevice + MutEventSubscriber + Debug, + >( + &mut self, + vm: &Vm, + id: String, + device: Arc>, + cmdline: &mut Cmdline, + is_vhost_user: bool, + ) -> Result<(), AttachDeviceError> { + let interrupt = Arc::new(IrqTrigger::new()); + // The device mutex mustn't be locked here otherwise it will deadlock. + let device = + MmioTransport::new(vm.guest_memory().clone(), interrupt, device, is_vhost_user); + self.mmio_devices + .register_mmio_virtio_for_boot(vm, id, device, cmdline)?; + + Ok(()) + } + + /// Attaches a VirtioDevice device to the device manager and event manager. 
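// Editor's sketch (illustrative, not part of this patch): on the MMIO path the
// interrupt and transport are created here, outside the device, and handed to
// `register_mmio_virtio_for_boot`. Assuming a `balloon: Arc<Mutex<Balloon>>`
// already built by the VMM builder, the call looks like:
//
//     device_manager.attach_mmio_virtio_device(
//         &vm,
//         "balloon".to_string(),
//         balloon.clone(),
//         &mut boot_cmdline,
//         false, // is_vhost_user
//     )?;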
+ pub(crate) fn attach_virtio_device( + &mut self, + vm: &Arc, + id: String, + device: Arc>, + cmdline: &mut Cmdline, + is_vhost_user: bool, + ) -> Result<(), AttachDeviceError> { + if self.pci_devices.pci_segment.is_some() { + self.pci_devices.attach_pci_virtio_device(vm, id, device)?; + } else { + self.attach_mmio_virtio_device(vm, id, device, cmdline, is_vhost_user)?; + } + + Ok(()) + } + + /// Attaches a [`BootTimer`] to the VM + pub(crate) fn attach_boot_timer_device( + &mut self, + vm: &Vm, + request_ts: TimestampUs, + ) -> Result<(), AttachDeviceError> { + let boot_timer = Arc::new(Mutex::new(BootTimer::new(request_ts))); + + self.mmio_devices + .register_mmio_boot_timer(&vm.common.mmio_bus, boot_timer)?; + + Ok(()) + } + + pub(crate) fn attach_vmgenid_device( + &mut self, + mem: &GuestMemoryMmap, + vm: &Vm, + ) -> Result<(), AttachDeviceError> { + let vmgenid = VmGenId::new(mem, &mut vm.resource_allocator())?; + self.acpi_devices.attach_vmgenid(vmgenid, vm)?; + Ok(()) + } + + #[cfg(target_arch = "aarch64")] + pub(crate) fn attach_legacy_devices_aarch64( + &mut self, + vm: &Vm, + event_manager: &mut EventManager, + cmdline: &mut Cmdline, + ) -> Result<(), AttachDeviceError> { + // Serial device setup. + let cmdline_contains_console = cmdline + .as_cstring() + .map_err(|_| AttachDeviceError::Cmdline)? + .into_string() + .map_err(|_| AttachDeviceError::Cmdline)? + .contains("console="); + + if cmdline_contains_console { + // Make stdout non-blocking. + Self::set_stdout_nonblocking(); + let serial = Self::setup_serial_device(event_manager)?; + self.mmio_devices.register_mmio_serial(vm, serial, None)?; + self.mmio_devices.add_mmio_serial_to_cmdline(cmdline)?; + } + + let rtc = Arc::new(Mutex::new(RTCDevice::new())); + self.mmio_devices.register_mmio_rtc(vm, rtc, None)?; + Ok(()) + } + + /// Enables PCIe support for Firecracker devices + pub fn enable_pci(&mut self, vm: &Arc) -> Result<(), PciManagerError> { + self.pci_devices.attach_pci_segment(vm) + } + + /// Artificially kick VirtIO devices as if they had external events. + pub fn kick_virtio_devices(&self) { + info!("Artificially kick devices"); + // Go through MMIO VirtIO devices + let _: Result<(), MmioError> = self.mmio_devices.for_each_virtio_device(|_, _, device| { + let mmio_transport_locked = device.inner.lock().expect("Poisoned lock"); + mmio_transport_locked + .device() + .lock() + .expect("Poisoned lock") + .kick(); + Ok(()) + }); + // Go through PCI VirtIO devices + for virtio_pci_device in self.pci_devices.virtio_devices.values() { + virtio_pci_device + .lock() + .expect("Poisoned lock") + .virtio_device() + .lock() + .expect("Poisoned lock") + .kick(); + } + } + + fn do_mark_virtio_queue_memory_dirty( + device: Arc>, + mem: &GuestMemoryMmap, + ) { + // SAFETY: + // This should never fail as we mark pages only if device has already been activated, + // and the address validation was already performed on device activation. 
+ let mut locked_device = device.lock().expect("Poisoned lock"); + if locked_device.is_activated() { + locked_device.mark_queue_memory_dirty(mem).unwrap() + } + } + + /// Mark queue memory dirty for activated VirtIO devices + pub fn mark_virtio_queue_memory_dirty(&self, mem: &GuestMemoryMmap) { + // Go through MMIO VirtIO devices + let _: Result<(), Infallible> = self.mmio_devices.for_each_virtio_device(|_, _, device| { + let mmio_transport_locked = device.inner.lock().expect("Poisoned locked"); + Self::do_mark_virtio_queue_memory_dirty(mmio_transport_locked.device(), mem); + Ok(()) + }); + + // Go through PCI VirtIO devices + for device in self.pci_devices.virtio_devices.values() { + let virtio_device = device.lock().expect("Poisoned lock").virtio_device(); + Self::do_mark_virtio_queue_memory_dirty(virtio_device, mem); + } + } + + /// Get a VirtIO device of type `virtio_type` with ID `device_id` + pub fn get_virtio_device( + &self, + virtio_type: u32, + device_id: &str, + ) -> Option>> { + if self.pci_devices.pci_segment.is_some() { + let pci_device = self.pci_devices.get_virtio_device(virtio_type, device_id)?; + Some( + pci_device + .lock() + .expect("Poisoned lock") + .virtio_device() + .clone(), + ) + } else { + let mmio_device = self + .mmio_devices + .get_virtio_device(virtio_type, device_id)?; + Some( + mmio_device + .inner + .lock() + .expect("Poisoned lock") + .device() + .clone(), + ) + } + } + + /// Run fn `f()` for the virtio device matching `virtio_type` and `id`. + pub fn with_virtio_device_with_id( + &self, + virtio_type: u32, + id: &str, + f: F, + ) -> Result<(), FindDeviceError> + where + T: VirtioDevice + 'static + Debug, + F: FnOnce(&mut T) -> Result<(), String>, + { + if let Some(device) = self.get_virtio_device(virtio_type, id) { + let mut dev = device.lock().expect("Poisoned lock"); + f(dev + .as_mut_any() + .downcast_mut::() + .ok_or(FindDeviceError::InvalidDeviceType)?) 
+ .map_err(FindDeviceError::InternalDeviceError)?; + } else { + return Err(FindDeviceError::DeviceNotFound); + } + Ok(()) + } +} + +#[derive(Debug, Default, Clone, Serialize, Deserialize)] +/// State of devices in the system +pub struct DevicesState { + /// MMIO devices state + pub mmio_state: persist::DeviceStates, + /// ACPI devices state + pub acpi_state: persist::ACPIDeviceManagerState, + /// PCI devices state + pub pci_state: pci_mngr::PciDevicesState, +} + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum DevicePersistError { + /// Error restoring MMIO devices: {0} + MmioRestore(#[from] persist::DevicePersistError), + /// Error restoring ACPI devices: {0} + AcpiRestore(#[from] persist::ACPIDeviceManagerRestoreError), + /// Error restoring PCI devices: {0} + PciRestore(#[from] PciManagerError), + /// Error notifying VMGenID device: {0} + VmGenidUpdate(#[from] std::io::Error), + /// Error resetting serial console: {0} + SerialRestore(#[from] EmulateSerialInitError), + /// Error inserting device in bus: {0} + Bus(#[from] vm_device::BusError), + /// Error creating DeviceManager: {0} + DeviceManager(#[from] DeviceManagerCreateError), +} + +pub struct DeviceRestoreArgs<'a> { + pub mem: &'a GuestMemoryMmap, + pub vm: &'a Arc, + pub event_manager: &'a mut EventManager, + pub vcpus_exit_evt: &'a EventFd, + pub vm_resources: &'a mut VmResources, + pub instance_id: &'a str, + pub restored_from_file: bool, +} + +impl std::fmt::Debug for DeviceRestoreArgs<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("DeviceRestoreArgs") + .field("mem", &self.mem) + .field("vm", &self.vm) + .field("vm_resources", &self.vm_resources) + .field("instance_id", &self.instance_id) + .field("restored_from_file", &self.restored_from_file) + .finish() + } +} + +impl<'a> Persist<'a> for DeviceManager { + type State = DevicesState; + type ConstructorArgs = DeviceRestoreArgs<'a>; + type Error = DevicePersistError; + + fn save(&self) -> Self::State { + DevicesState { + mmio_state: self.mmio_devices.save(), + acpi_state: self.acpi_devices.save(), + pci_state: self.pci_devices.save(), + } + } + + fn restore( + constructor_args: Self::ConstructorArgs, + state: &Self::State, + ) -> std::result::Result { + // Setup legacy devices in case of x86 + #[cfg(target_arch = "x86_64")] + let legacy_devices = Self::create_legacy_devices( + constructor_args.event_manager, + constructor_args.vcpus_exit_evt, + constructor_args.vm, + )?; + + // Restore MMIO devices + let mmio_ctor_args = MMIODevManagerConstructorArgs { + mem: constructor_args.mem, + vm: constructor_args.vm, + event_manager: constructor_args.event_manager, + vm_resources: constructor_args.vm_resources, + instance_id: constructor_args.instance_id, + restored_from_file: constructor_args.restored_from_file, + }; + let mmio_devices = MMIODeviceManager::restore(mmio_ctor_args, &state.mmio_state)?; + + // Restore ACPI devices + let acpi_ctor_args = ACPIDeviceManagerConstructorArgs { + mem: constructor_args.mem, + vm: constructor_args.vm, + }; + let mut acpi_devices = ACPIDeviceManager::restore(acpi_ctor_args, &state.acpi_state)?; + acpi_devices.notify_vmgenid()?; + + // Restore PCI devices + let pci_ctor_args = PciDevicesConstructorArgs { + vm: constructor_args.vm.clone(), + mem: constructor_args.mem, + vm_resources: constructor_args.vm_resources, + instance_id: constructor_args.instance_id, + restored_from_file: constructor_args.restored_from_file, + event_manager: constructor_args.event_manager, + }; + let pci_devices 
= PciDevices::restore(pci_ctor_args, &state.pci_state)?; + + let device_manager = DeviceManager { + mmio_devices, + #[cfg(target_arch = "x86_64")] + legacy_devices, + acpi_devices, + pci_devices, + }; + + // Restore serial. + // We need to do that after we restore mmio devices, otherwise it won't succeed in Aarch64 + device_manager.emulate_serial_init()?; + + Ok(device_manager) + } +} + +impl DeviceManager { + /// Sets RDA bit in serial console + pub fn emulate_serial_init(&self) -> Result<(), EmulateSerialInitError> { + // When restoring from a previously saved state, there is no serial + // driver initialization, therefore the RDA (Received Data Available) + // interrupt is not enabled. Because of that, the driver won't get + // notified of any bytes that we send to the guest. The clean solution + // would be to save the whole serial device state when we do the vm + // serialization. For now we set that bit manually + + #[cfg(target_arch = "aarch64")] + { + if let Some(device) = &self.mmio_devices.serial { + let mut device_locked = device.inner.lock().expect("Poisoned lock"); + + device_locked + .serial + .write(IER_RDA_OFFSET, IER_RDA_BIT) + .map_err(|_| EmulateSerialInitError(std::io::Error::last_os_error()))?; + } + Ok(()) + } + + #[cfg(target_arch = "x86_64")] + { + let mut serial = self + .legacy_devices + .stdio_serial + .lock() + .expect("Poisoned lock"); + + serial + .serial + .write(IER_RDA_OFFSET, IER_RDA_BIT) + .map_err(|_| EmulateSerialInitError(std::io::Error::last_os_error()))?; + Ok(()) + } + } +} + +#[cfg(test)] +pub(crate) mod tests { + use super::*; + #[cfg(target_arch = "aarch64")] + use crate::builder::tests::default_vmm; + + pub(crate) fn default_device_manager() -> DeviceManager { + let mmio_devices = MMIODeviceManager::new(); + let acpi_devices = ACPIDeviceManager::new(); + let pci_devices = PciDevices::new(); + + #[cfg(target_arch = "x86_64")] + let legacy_devices = PortIODeviceManager::new( + Arc::new(Mutex::new( + SerialDevice::new(None, SerialOut::Sink(std::io::sink())).unwrap(), + )), + Arc::new(Mutex::new( + I8042Device::new(EventFd::new(libc::EFD_NONBLOCK).unwrap()).unwrap(), + )), + ) + .unwrap(); + + DeviceManager { + mmio_devices, + #[cfg(target_arch = "x86_64")] + legacy_devices, + acpi_devices, + pci_devices, + } + } + + #[cfg(target_arch = "aarch64")] + #[test] + fn test_attach_legacy_serial() { + let mut vmm = default_vmm(); + assert!(vmm.device_manager.mmio_devices.rtc.is_none()); + assert!(vmm.device_manager.mmio_devices.serial.is_none()); + + let mut cmdline = Cmdline::new(4096).unwrap(); + let mut event_manager = EventManager::new().unwrap(); + vmm.device_manager + .attach_legacy_devices_aarch64(&vmm.vm, &mut event_manager, &mut cmdline) + .unwrap(); + assert!(vmm.device_manager.mmio_devices.rtc.is_some()); + assert!(vmm.device_manager.mmio_devices.serial.is_none()); + + let mut vmm = default_vmm(); + cmdline.insert("console", "/dev/blah").unwrap(); + vmm.device_manager + .attach_legacy_devices_aarch64(&vmm.vm, &mut event_manager, &mut cmdline) + .unwrap(); + assert!(vmm.device_manager.mmio_devices.rtc.is_some()); + assert!(vmm.device_manager.mmio_devices.serial.is_some()); + + assert!( + cmdline + .as_cstring() + .unwrap() + .into_string() + .unwrap() + .contains(&format!( + "earlycon=uart,mmio,0x{:08x}", + vmm.device_manager + .mmio_devices + .serial + .as_ref() + .unwrap() + .resources + .addr + )) + ); + } +} diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs new file mode 100644 index 
00000000000..578d521162b --- /dev/null +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -0,0 +1,756 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::collections::HashMap; +use std::fmt::Debug; +use std::ops::DerefMut; +use std::sync::{Arc, Mutex}; + +use event_manager::{MutEventSubscriber, SubscriberOps}; +use log::{debug, error, warn}; +use pci::{PciBarRegionType, PciDevice, PciDeviceError, PciRootError}; +use serde::{Deserialize, Serialize}; +use vm_device::BusError; + +use super::persist::{MmdsState, SharedDeviceType}; +use crate::devices::pci::PciSegment; +use crate::devices::virtio::balloon::Balloon; +use crate::devices::virtio::balloon::persist::{BalloonConstructorArgs, BalloonState}; +use crate::devices::virtio::block::device::Block; +use crate::devices::virtio::block::persist::{BlockConstructorArgs, BlockState}; +use crate::devices::virtio::device::VirtioDevice; +use crate::devices::virtio::net::Net; +use crate::devices::virtio::net::persist::{NetConstructorArgs, NetState}; +use crate::devices::virtio::rng::Entropy; +use crate::devices::virtio::rng::persist::{EntropyConstructorArgs, EntropyState}; +use crate::devices::virtio::transport::pci::device::{ + VirtioPciDevice, VirtioPciDeviceError, VirtioPciDeviceState, +}; +use crate::devices::virtio::vsock::persist::{ + VsockConstructorArgs, VsockState, VsockUdsConstructorArgs, +}; +use crate::devices::virtio::vsock::{TYPE_VSOCK, Vsock, VsockUnixBackend}; +use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; +use crate::resources::VmResources; +use crate::snapshot::Persist; +use crate::vmm_config::mmds::MmdsConfigError; +use crate::vstate::memory::GuestMemoryMmap; +use crate::vstate::vm::{InterruptError, MsiVectorGroup}; +use crate::{EventManager, Vm}; + +#[derive(Debug, Default)] +pub struct PciDevices { + /// PCIe segment of the VMM, if PCI is enabled. We currently support a single PCIe segment. + pub pci_segment: Option, + /// All VirtIO PCI devices of the system + pub virtio_devices: HashMap<(u32, String), Arc>>, +} + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum PciManagerError { + /// Resource allocation error: {0} + ResourceAllocation(#[from] vm_allocator::Error), + /// Bus error: {0} + Bus(#[from] BusError), + /// PCI root error: {0} + PciRoot(#[from] PciRootError), + /// MSI error: {0} + Msi(#[from] InterruptError), + /// VirtIO PCI device error: {0} + VirtioPciDevice(#[from] VirtioPciDeviceError), + /// PCI device error: {0} + PciDeviceError(#[from] PciDeviceError), + /// KVM error: {0} + Kvm(#[from] vmm_sys_util::errno::Error), + /// MMDS error: {0} + Mmds(#[from] MmdsConfigError), +} + +impl PciDevices { + pub fn new() -> Self { + Default::default() + } + + pub fn attach_pci_segment(&mut self, vm: &Arc) -> Result<(), PciManagerError> { + // We only support a single PCIe segment. Calling this function twice is a Firecracker + // internal error. + assert!(self.pci_segment.is_none()); + + // Currently we don't assign any IRQs to PCI devices. We will be using MSI-X interrupts + // only. 
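// Editor's sketch (illustrative, not part of this patch): callers opt into PCI
// through `DeviceManager::enable_pci`, which ends up here; once the segment
// exists, `attach_virtio_device` prefers the PCI transport over MMIO. Assuming
// a `block: Arc<Mutex<Block>>` built by the VMM builder:
//
//     device_manager.enable_pci(&vm)?; // vm: Arc<Vm>
//     device_manager.attach_virtio_device(&vm, "rootfs".to_string(), block, &mut cmdline, false)?;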
+ let pci_segment = PciSegment::new(0, vm, &[0u8; 32])?; + self.pci_segment = Some(pci_segment); + + Ok(()) + } + + fn register_bars_with_bus( + vm: &Vm, + virtio_device: &Arc>, + ) -> Result<(), PciManagerError> { + let virtio_device_locked = virtio_device.lock().expect("Poisoned lock"); + let bar = &virtio_device_locked.bar_region; + assert_eq!(bar.region_type, PciBarRegionType::Memory64BitRegion); + + debug!("Inserting MMIO BAR region: {:#x}:{:#x}", bar.addr, bar.size); + vm.common + .mmio_bus + .insert(virtio_device.clone(), bar.addr, bar.size)?; + + Ok(()) + } + + pub(crate) fn attach_pci_virtio_device< + T: 'static + VirtioDevice + MutEventSubscriber + Debug, + >( + &mut self, + vm: &Arc, + id: String, + device: Arc>, + ) -> Result<(), PciManagerError> { + // We should only be reaching this point if PCI is enabled + let pci_segment = self.pci_segment.as_ref().unwrap(); + let pci_device_bdf = pci_segment.next_device_bdf()?; + debug!("Allocating BDF: {pci_device_bdf:?} for device"); + let mem = vm.guest_memory().clone(); + + let device_type: u32 = device.lock().expect("Poisoned lock").device_type(); + + // Allocate one MSI vector per queue, plus one for configuration + let msix_num = + u16::try_from(device.lock().expect("Poisoned lock").queues().len() + 1).unwrap(); + + let msix_vectors = Arc::new(Vm::create_msix_group(vm.clone(), msix_num)?); + + // Create the transport + let mut virtio_device = + VirtioPciDevice::new(id.clone(), mem, device, msix_vectors, pci_device_bdf.into())?; + + // Allocate bars + let mut resource_allocator_lock = vm.resource_allocator(); + let resource_allocator = resource_allocator_lock.deref_mut(); + + virtio_device.allocate_bars( + &mut resource_allocator.mmio32_memory, + &mut resource_allocator.mmio64_memory, + )?; + + let virtio_device = Arc::new(Mutex::new(virtio_device)); + pci_segment + .pci_bus + .lock() + .expect("Poisoned lock") + .add_device(pci_device_bdf.device() as u32, virtio_device.clone())?; + + self.virtio_devices + .insert((device_type, id.clone()), virtio_device.clone()); + + Self::register_bars_with_bus(vm, &virtio_device)?; + virtio_device + .lock() + .expect("Poisoned lock") + .register_notification_ioevent(vm)?; + + Ok(()) + } + + fn restore_pci_device( + &mut self, + vm: &Arc, + device: Arc>, + device_id: &str, + transport_state: &VirtioPciDeviceState, + event_manager: &mut EventManager, + ) -> Result<(), PciManagerError> { + // We should only be reaching this point if PCI is enabled + let pci_segment = self.pci_segment.as_ref().unwrap(); + let msi_vector_group = Arc::new(MsiVectorGroup::restore( + vm.clone(), + &transport_state.msi_vector_group, + )?); + let device_type: u32 = device.lock().expect("Poisoned lock").device_type(); + + let virtio_device = Arc::new(Mutex::new(VirtioPciDevice::new_from_state( + device_id.to_string(), + vm.guest_memory().clone(), + device.clone(), + msi_vector_group, + transport_state.clone(), + )?)); + + pci_segment + .pci_bus + .lock() + .expect("Poisoned lock") + .add_device( + transport_state.pci_device_bdf.device() as u32, + virtio_device.clone(), + )?; + + self.virtio_devices + .insert((device_type, device_id.to_string()), virtio_device.clone()); + + Self::register_bars_with_bus(vm, &virtio_device)?; + virtio_device + .lock() + .expect("Poisoned lock") + .register_notification_ioevent(vm)?; + + event_manager.add_subscriber(device); + + Ok(()) + } + + /// Gets the specified device. 
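// Editor's sketch (illustrative, not part of this patch): PCI-attached VirtIO
// devices are keyed by (virtio device type, id), the same scheme the MMIO
// manager uses, so a lookup reaches the VirtIO device through the PCI transport
// ("eth0" is an assumed example id):
//
//     let net_pci = pci_devices
//         .get_virtio_device(TYPE_NET, "eth0")
//         .expect("net device registered");
//     let virtio_net = net_pci.lock().expect("Poisoned lock").virtio_device();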
+ pub fn get_virtio_device( + &self, + device_type: u32, + device_id: &str, + ) -> Option<&Arc>> { + self.virtio_devices + .get(&(device_type, device_id.to_string())) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VirtioDeviceState { + /// Device identifier + pub device_id: String, + /// Device BDF + pub pci_device_bdf: u32, + /// Device state + pub device_state: T, + /// Transport state + pub transport_state: VirtioPciDeviceState, +} + +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct PciDevicesState { + /// Whether PCI is enabled + pub pci_enabled: bool, + /// Block device states. + pub block_devices: Vec>, + /// Net device states. + pub net_devices: Vec>, + /// Vsock device state. + pub vsock_device: Option>, + /// Balloon device state. + pub balloon_device: Option>, + /// Mmds state. + pub mmds: Option, + /// Entropy device state. + pub entropy_device: Option>, +} + +pub struct PciDevicesConstructorArgs<'a> { + pub vm: Arc, + pub mem: &'a GuestMemoryMmap, + pub vm_resources: &'a mut VmResources, + pub instance_id: &'a str, + pub restored_from_file: bool, + pub event_manager: &'a mut EventManager, +} + +impl<'a> Debug for PciDevicesConstructorArgs<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PciDevicesConstructorArgs") + .field("vm", &self.vm) + .field("mem", &self.mem) + .field("vm_resources", &self.vm_resources) + .field("instance_id", &self.instance_id) + .field("restored_from_file", &self.restored_from_file) + .finish() + } +} + +impl<'a> Persist<'a> for PciDevices { + type State = PciDevicesState; + type ConstructorArgs = PciDevicesConstructorArgs<'a>; + type Error = PciManagerError; + + fn save(&self) -> Self::State { + let mut state = PciDevicesState::default(); + if self.pci_segment.is_some() { + state.pci_enabled = true; + } else { + return state; + } + + for pci_dev in self.virtio_devices.values() { + let locked_pci_dev = pci_dev.lock().expect("Poisoned lock"); + let transport_state = locked_pci_dev.state(); + let virtio_dev = locked_pci_dev.virtio_device(); + let mut locked_virtio_dev = virtio_dev.lock().expect("Poisoned lock"); + + let pci_device_bdf = transport_state.pci_device_bdf.into(); + + match locked_virtio_dev.device_type() { + TYPE_BALLOON => { + let balloon_device = locked_virtio_dev + .as_any() + .downcast_ref::() + .unwrap(); + + let device_state = balloon_device.save(); + + state.balloon_device = Some(VirtioDeviceState { + device_id: balloon_device.id().to_string(), + pci_device_bdf, + device_state, + transport_state, + }); + } + TYPE_BLOCK => { + let block_dev = locked_virtio_dev + .as_mut_any() + .downcast_mut::() + .unwrap(); + if block_dev.is_vhost_user() { + warn!( + "Skipping vhost-user-block device. 
VhostUserBlock does not support \ + snapshotting yet" + ); + } else { + block_dev.prepare_save(); + let device_state = block_dev.save(); + state.block_devices.push(VirtioDeviceState { + device_id: block_dev.id().to_string(), + pci_device_bdf, + device_state, + transport_state, + }); + } + } + TYPE_NET => { + let net_dev = locked_virtio_dev + .as_mut_any() + .downcast_mut::() + .unwrap(); + if let (Some(mmds_ns), None) = (net_dev.mmds_ns.as_ref(), state.mmds.as_ref()) { + let mmds_guard = mmds_ns.mmds.lock().expect("Poisoned lock"); + state.mmds = Some(MmdsState { + version: mmds_guard.version(), + imds_compat: mmds_guard.imds_compat(), + }); + } + net_dev.prepare_save(); + let device_state = net_dev.save(); + + state.net_devices.push(VirtioDeviceState { + device_id: net_dev.id().to_string(), + pci_device_bdf, + device_state, + transport_state, + }) + } + TYPE_VSOCK => { + let vsock_dev = locked_virtio_dev + .as_mut_any() + // Currently, VsockUnixBackend is the only implementation of VsockBackend. + .downcast_mut::>() + .unwrap(); + + // Send Transport event to reset connections if device + // is activated. + if vsock_dev.is_activated() { + vsock_dev + .send_transport_reset_event() + .unwrap_or_else(|err| { + error!("Failed to send reset transport event: {:?}", err); + }); + } + + // Save state after potential notification to the guest. This + // way we save changes to the queue the notification can cause. + let vsock_state = VsockState { + backend: vsock_dev.backend().save(), + frontend: vsock_dev.save(), + }; + + state.vsock_device = Some(VirtioDeviceState { + device_id: vsock_dev.id().to_string(), + pci_device_bdf, + device_state: vsock_state, + transport_state, + }); + } + TYPE_RNG => { + let rng_dev = locked_virtio_dev + .as_mut_any() + .downcast_mut::() + .unwrap(); + let device_state = rng_dev.save(); + + state.entropy_device = Some(VirtioDeviceState { + device_id: rng_dev.id().to_string(), + pci_device_bdf, + device_state, + transport_state, + }) + } + _ => unreachable!(), + } + } + + state + } + + fn restore( + constructor_args: Self::ConstructorArgs, + state: &Self::State, + ) -> std::result::Result { + let mem = constructor_args.mem; + let mut pci_devices = PciDevices::new(); + if !state.pci_enabled { + return Ok(pci_devices); + } + + pci_devices.attach_pci_segment(&constructor_args.vm)?; + + if let Some(balloon_state) = &state.balloon_device { + let device = Arc::new(Mutex::new( + Balloon::restore( + BalloonConstructorArgs { + mem: mem.clone(), + restored_from_file: constructor_args.restored_from_file, + }, + &balloon_state.device_state, + ) + .unwrap(), + )); + + constructor_args + .vm_resources + .update_from_restored_device(SharedDeviceType::Balloon(device.clone())) + .unwrap(); + + pci_devices + .restore_pci_device( + &constructor_args.vm, + device, + &balloon_state.device_id, + &balloon_state.transport_state, + constructor_args.event_manager, + ) + .unwrap() + } + + for block_state in &state.block_devices { + let device = Arc::new(Mutex::new( + Block::restore( + BlockConstructorArgs { mem: mem.clone() }, + &block_state.device_state, + ) + .unwrap(), + )); + + constructor_args + .vm_resources + .update_from_restored_device(SharedDeviceType::VirtioBlock(device.clone())) + .unwrap(); + + pci_devices + .restore_pci_device( + &constructor_args.vm, + device, + &block_state.device_id, + &block_state.transport_state, + constructor_args.event_manager, + ) + .unwrap() + } + + // Initialize MMDS if MMDS state is included. 
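+        // The MMDS data store has to exist before the net devices are restored
+        // further down, because `NetConstructorArgs` clones the `Arc` to the
+        // shared store. A condensed view of the decision made by the two
+        // branches that follow (`any_mmds_ns` stands for the iterator check
+        // over `state.net_devices`):
+        //
+        //     match (&state.mmds, any_mmds_ns) {
+        //         (Some(s), _)  => set_mmds_basic_config(s.version, s.imds_compat, instance_id),
+        //         (None, true)  => mmds_or_default(),   // snapshot predates MmdsState
+        //         (None, false) => { /* no MMDS configured */ }
+        //     }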
+ if let Some(mmds) = &state.mmds { + constructor_args + .vm_resources + .set_mmds_basic_config(mmds.version, mmds.imds_compat, constructor_args.instance_id) + .unwrap(); + } else if state + .net_devices + .iter() + .any(|dev| dev.device_state.mmds_ns.is_some()) + { + // If there's at least one network device having an mmds_ns, it means + // that we are restoring from a version that did not persist the `MmdsVersionState`. + // Init with the default. + constructor_args.vm_resources.mmds_or_default()?; + } + + for net_state in &state.net_devices { + let device = Arc::new(Mutex::new( + Net::restore( + NetConstructorArgs { + mem: mem.clone(), + mmds: constructor_args + .vm_resources + .mmds + .as_ref() + // Clone the Arc reference. + .cloned(), + }, + &net_state.device_state, + ) + .unwrap(), + )); + + constructor_args + .vm_resources + .update_from_restored_device(SharedDeviceType::Network(device.clone())) + .unwrap(); + + pci_devices + .restore_pci_device( + &constructor_args.vm, + device, + &net_state.device_id, + &net_state.transport_state, + constructor_args.event_manager, + ) + .unwrap() + } + + if let Some(vsock_state) = &state.vsock_device { + let ctor_args = VsockUdsConstructorArgs { + cid: vsock_state.device_state.frontend.cid, + }; + let backend = + VsockUnixBackend::restore(ctor_args, &vsock_state.device_state.backend).unwrap(); + let device = Arc::new(Mutex::new( + Vsock::restore( + VsockConstructorArgs { + mem: mem.clone(), + backend, + }, + &vsock_state.device_state.frontend, + ) + .unwrap(), + )); + + constructor_args + .vm_resources + .update_from_restored_device(SharedDeviceType::Vsock(device.clone())) + .unwrap(); + + pci_devices + .restore_pci_device( + &constructor_args.vm, + device, + &vsock_state.device_id, + &vsock_state.transport_state, + constructor_args.event_manager, + ) + .unwrap() + } + + if let Some(entropy_state) = &state.entropy_device { + let ctor_args = EntropyConstructorArgs { mem: mem.clone() }; + + let device = Arc::new(Mutex::new( + Entropy::restore(ctor_args, &entropy_state.device_state).unwrap(), + )); + + constructor_args + .vm_resources + .update_from_restored_device(SharedDeviceType::Entropy(device.clone())) + .unwrap(); + + pci_devices + .restore_pci_device( + &constructor_args.vm, + device, + &entropy_state.device_id, + &entropy_state.transport_state, + constructor_args.event_manager, + ) + .unwrap() + } + + Ok(pci_devices) + } +} + +#[cfg(test)] +mod tests { + use vmm_sys_util::tempfile::TempFile; + + use super::*; + use crate::builder::tests::*; + use crate::device_manager; + use crate::devices::virtio::block::CacheType; + use crate::mmds::data_store::MmdsVersion; + use crate::resources::VmmConfig; + use crate::snapshot::Snapshot; + use crate::vmm_config::balloon::BalloonDeviceConfig; + use crate::vmm_config::entropy::EntropyDeviceConfig; + use crate::vmm_config::net::NetworkInterfaceConfig; + use crate::vmm_config::vsock::VsockDeviceConfig; + + #[test] + fn test_device_manager_persistence() { + let mut buf = vec![0; 65536]; + // These need to survive so the restored blocks find them. + let _block_files; + let mut tmp_sock_file = TempFile::new().unwrap(); + tmp_sock_file.remove().unwrap(); + // Set up a vmm with one of each device, and get the serialized DeviceStates. + { + let mut event_manager = EventManager::new().expect("Unable to create EventManager"); + let mut vmm = default_vmm(); + vmm.device_manager.enable_pci(&vmm.vm).unwrap(); + let mut cmdline = default_kernel_cmdline(); + + // Add a balloon device. 
+ let balloon_cfg = BalloonDeviceConfig { + amount_mib: 123, + deflate_on_oom: false, + stats_polling_interval_s: 1, + }; + insert_balloon_device(&mut vmm, &mut cmdline, &mut event_manager, balloon_cfg); + // Add a block device. + let drive_id = String::from("root"); + let block_configs = vec![CustomBlockConfig::new( + drive_id, + true, + None, + true, + CacheType::Unsafe, + )]; + _block_files = + insert_block_devices(&mut vmm, &mut cmdline, &mut event_manager, block_configs); + // Add a net device. + let network_interface = NetworkInterfaceConfig { + iface_id: String::from("netif"), + host_dev_name: String::from("hostname"), + guest_mac: None, + rx_rate_limiter: None, + tx_rate_limiter: None, + }; + insert_net_device_with_mmds( + &mut vmm, + &mut cmdline, + &mut event_manager, + network_interface, + MmdsVersion::V2, + ); + // Add a vsock device. + let vsock_dev_id = "vsock"; + let vsock_config = VsockDeviceConfig { + vsock_id: Some(vsock_dev_id.to_string()), + guest_cid: 3, + uds_path: tmp_sock_file.as_path().to_str().unwrap().to_string(), + }; + insert_vsock_device(&mut vmm, &mut cmdline, &mut event_manager, vsock_config); + // Add an entropy device. + let entropy_config = EntropyDeviceConfig::default(); + insert_entropy_device(&mut vmm, &mut cmdline, &mut event_manager, entropy_config); + + Snapshot::serialize(&mut buf.as_mut_slice(), &vmm.device_manager.save()).unwrap(); + } + + tmp_sock_file.remove().unwrap(); + + let mut event_manager = EventManager::new().expect("Unable to create EventManager"); + // Keep in mind we are re-creating here an empty DeviceManager. Restoring later on + // will create a new PciDevices manager different than vmm.pci_devices. We're doing + // this to avoid restoring the whole Vmm, since what we really need from Vmm is the Vm + // object and calling default_vmm() is the easiest way to create one. 
+ let vmm = default_vmm(); + let device_manager_state: device_manager::DevicesState = + Snapshot::deserialize(&mut buf.as_slice()).unwrap(); + let vm_resources = &mut VmResources::default(); + let restore_args = PciDevicesConstructorArgs { + vm: vmm.vm.clone(), + mem: vmm.vm.guest_memory(), + vm_resources, + instance_id: "microvm-id", + restored_from_file: true, + event_manager: &mut event_manager, + }; + let _restored_dev_manager = + PciDevices::restore(restore_args, &device_manager_state.pci_state).unwrap(); + + let expected_vm_resources = format!( + r#"{{ + "balloon": {{ + "amount_mib": 123, + "deflate_on_oom": false, + "stats_polling_interval_s": 1 + }}, + "drives": [ + {{ + "drive_id": "root", + "partuuid": null, + "is_root_device": true, + "cache_type": "Unsafe", + "is_read_only": true, + "path_on_host": "{}", + "rate_limiter": null, + "io_engine": "Sync", + "socket": null + }} + ], + "boot-source": {{ + "kernel_image_path": "", + "initrd_path": null, + "boot_args": null + }}, + "cpu-config": null, + "logger": null, + "machine-config": {{ + "vcpu_count": 1, + "mem_size_mib": 128, + "smt": false, + "track_dirty_pages": false, + "huge_pages": "None" + }}, + "metrics": null, + "mmds-config": {{ + "version": "V2", + "network_interfaces": [ + "netif" + ], + "ipv4_address": "169.254.169.254", + "imds_compat": false + }}, + "network-interfaces": [ + {{ + "iface_id": "netif", + "host_dev_name": "hostname", + "guest_mac": null, + "rx_rate_limiter": null, + "tx_rate_limiter": null + }} + ], + "vsock": {{ + "guest_cid": 3, + "uds_path": "{}" + }}, + "entropy": {{ + "rate_limiter": null + }} +}}"#, + _block_files.last().unwrap().as_path().to_str().unwrap(), + tmp_sock_file.as_path().to_str().unwrap() + ); + + assert_eq!( + vm_resources + .mmds + .as_ref() + .unwrap() + .lock() + .unwrap() + .version(), + MmdsVersion::V2 + ); + assert_eq!( + device_manager_state.pci_state.mmds.unwrap().version, + MmdsVersion::V2 + ); + assert_eq!( + expected_vm_resources, + serde_json::to_string_pretty(&VmmConfig::from(&*vm_resources)).unwrap() + ); + } +} diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index fd24db52c3b..74e71f3a6bf 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -7,25 +7,24 @@ use std::fmt::{self, Debug}; use std::sync::{Arc, Mutex}; use event_manager::{MutEventSubscriber, SubscriberOps}; -use kvm_ioctls::VmFd; use log::{error, warn}; use serde::{Deserialize, Serialize}; -use vm_allocator::AllocPolicy; use super::acpi::ACPIDeviceManager; use super::mmio::*; -use super::resources::ResourceAllocator; -use crate::EventManager; #[cfg(target_arch = "aarch64")] use crate::arch::DeviceType; use crate::devices::acpi::vmgenid::{VMGenIDState, VMGenIdConstructorArgs, VmGenId, VmGenIdError}; +#[cfg(target_arch = "aarch64")] +use crate::devices::legacy::serial::SerialOut; +#[cfg(target_arch = "aarch64")] +use crate::devices::legacy::{RTCDevice, SerialDevice}; use crate::devices::virtio::balloon::persist::{BalloonConstructorArgs, BalloonState}; use crate::devices::virtio::balloon::{Balloon, BalloonError}; use crate::devices::virtio::block::BlockError; use crate::devices::virtio::block::device::Block; use crate::devices::virtio::block::persist::{BlockConstructorArgs, BlockState}; use crate::devices::virtio::device::VirtioDevice; -use crate::devices::virtio::mmio::MmioTransport; use crate::devices::virtio::net::Net; use crate::devices::virtio::net::persist::{ NetConstructorArgs, NetPersistError as NetError, NetState, 
@@ -35,18 +34,20 @@ use crate::devices::virtio::rng::Entropy; use crate::devices::virtio::rng::persist::{ EntropyConstructorArgs, EntropyPersistError as EntropyError, EntropyState, }; +use crate::devices::virtio::transport::mmio::{IrqTrigger, MmioTransport}; use crate::devices::virtio::vsock::persist::{ VsockConstructorArgs, VsockState, VsockUdsConstructorArgs, }; use crate::devices::virtio::vsock::{ TYPE_VSOCK, Vsock, VsockError, VsockUnixBackend, VsockUnixBackendError, }; -use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; +use crate::devices::virtio::{ActivateError, TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; use crate::mmds::data_store::MmdsVersion; use crate::resources::{ResourcesError, VmResources}; use crate::snapshot::Persist; use crate::vmm_config::mmds::MmdsConfigError; use crate::vstate::memory::GuestMemoryMmap; +use crate::{EventManager, Vm}; /// Errors for (de)serialization of the MMIO device manager. #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -59,9 +60,11 @@ pub enum DevicePersistError { DeviceManager(#[from] super::mmio::MmioError), /// Mmio transport MmioTransport, + /// Bus error: {0} + Bus(#[from] vm_device::BusError), #[cfg(target_arch = "aarch64")] /// Legacy: {0} - Legacy(#[from] crate::VmmError), + Legacy(#[from] std::io::Error), /// Net: {0} Net(#[from] NetError), /// Vsock: {0} @@ -74,67 +77,17 @@ pub enum DevicePersistError { Entropy(#[from] EntropyError), /// Resource misconfiguration: {0}. Is the snapshot file corrupted? ResourcesError(#[from] ResourcesError), + /// Could not activate device: {0} + DeviceActivation(#[from] ActivateError), } -/// Holds the state of a balloon device connected to the MMIO space. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ConnectedBalloonState { - /// Device identifier. - pub device_id: String, - /// Device state. - pub device_state: BalloonState, - /// Mmio transport state. - pub transport_state: MmioTransportState, - /// VmmResources. - pub device_info: MMIODeviceInfo, -} - -/// Holds the state of a virtio block device connected to the MMIO space. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ConnectedBlockState { - /// Device identifier. - pub device_id: String, - /// Device state. - pub device_state: BlockState, - /// Mmio transport state. - pub transport_state: MmioTransportState, - /// VmmResources. - pub device_info: MMIODeviceInfo, -} - -/// Holds the state of a net device connected to the MMIO space. +/// Holds the state of a MMIO VirtIO device #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ConnectedNetState { +pub struct VirtioDeviceState { /// Device identifier. pub device_id: String, /// Device state. - pub device_state: NetState, - /// Mmio transport state. - pub transport_state: MmioTransportState, - /// VmmResources. - pub device_info: MMIODeviceInfo, -} - -/// Holds the state of a vsock device connected to the MMIO space. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ConnectedVsockState { - /// Device identifier. - pub device_id: String, - /// Device state. - pub device_state: VsockState, - /// Mmio transport state. - pub transport_state: MmioTransportState, - /// VmmResources. - pub device_info: MMIODeviceInfo, -} - -/// Holds the state of an entropy device connected to the MMIO space. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ConnectedEntropyState { - /// Device identifier. - pub device_id: String, - /// Device state. 
- pub device_state: EntropyState, + pub device_state: T, /// Mmio transport state. pub transport_state: MmioTransportState, /// VmmResources. @@ -153,8 +106,8 @@ pub struct ConnectedLegacyState { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct MmdsState { - version: MmdsVersion, - imds_compat: bool, + pub version: MmdsVersion, + pub imds_compat: bool, } /// Holds the device states. @@ -164,17 +117,17 @@ pub struct DeviceStates { // State of legacy devices in MMIO space. pub legacy_devices: Vec, /// Block device states. - pub block_devices: Vec, + pub block_devices: Vec>, /// Net device states. - pub net_devices: Vec, + pub net_devices: Vec>, /// Vsock device state. - pub vsock_device: Option, + pub vsock_device: Option>, /// Balloon device state. - pub balloon_device: Option, + pub balloon_device: Option>, /// Mmds version. pub mmds: Option, /// Entropy device state. - pub entropy_device: Option, + pub entropy_device: Option>, } /// A type used to extract the concrete `Arc>` for each of the device @@ -190,9 +143,8 @@ pub enum SharedDeviceType { pub struct MMIODevManagerConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, - pub vm: &'a VmFd, + pub vm: &'a Vm, pub event_manager: &'a mut EventManager, - pub resource_allocator: &'a mut ResourceAllocator, pub vm_resources: &'a mut VmResources, pub instance_id: &'a str, pub restored_from_file: bool, @@ -215,10 +167,10 @@ pub struct ACPIDeviceManagerState { vmgenid: Option, } +#[derive(Debug)] pub struct ACPIDeviceManagerConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, - pub resource_allocator: &'a mut ResourceAllocator, - pub vm: &'a VmFd, + pub vm: &'a Vm, } #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -249,7 +201,7 @@ impl<'a> Persist<'a> for ACPIDeviceManager { let vmgenid = VmGenId::restore( VMGenIdConstructorArgs { mem: constructor_args.mem, - resource_allocator: constructor_args.resource_allocator, + resource_allocator: &mut constructor_args.vm.resource_allocator(), }, vmgenid_args, )?; @@ -266,44 +218,43 @@ impl<'a> Persist<'a> for MMIODeviceManager { fn save(&self) -> Self::State { let mut states = DeviceStates::default(); - let _: Result<(), ()> = self.for_each_device(|devtype, devid, device_info, bus_dev| { - if *devtype == crate::arch::DeviceType::BootTimer { - // No need to save BootTimer state. 
- return Ok(()); - } - #[cfg(target_arch = "aarch64")] - { - if *devtype == DeviceType::Serial || *devtype == DeviceType::Rtc { - states.legacy_devices.push(ConnectedLegacyState { - type_: *devtype, - device_info: device_info.clone(), - }); - return Ok(()); - } + #[cfg(target_arch = "aarch64")] + { + if let Some(device) = &self.serial { + states.legacy_devices.push(ConnectedLegacyState { + type_: DeviceType::Serial, + device_info: device.resources, + }); } - let locked_bus_dev = bus_dev.lock().expect("Poisoned lock"); - - let mmio_transport = locked_bus_dev - .mmio_transport_ref() - .expect("Unexpected device type"); + if let Some(device) = &self.rtc { + states.legacy_devices.push(ConnectedLegacyState { + type_: DeviceType::Rtc, + device_info: device.resources, + }); + } + } - let transport_state = mmio_transport.save(); + let _: Result<(), ()> = self.for_each_virtio_device(|_, devid, device| { + let mmio_transport_locked = device.inner.lock().expect("Poisoned lock"); + let transport_state = mmio_transport_locked.save(); + let device_info = device.resources; + let device_id = devid.clone(); - let mut locked_device = mmio_transport.locked_device(); + let mut locked_device = mmio_transport_locked.locked_device(); match locked_device.device_type() { TYPE_BALLOON => { - let balloon_state = locked_device + let device_state = locked_device .as_any() .downcast_ref::() .unwrap() .save(); - states.balloon_device = Some(ConnectedBalloonState { - device_id: devid.clone(), - device_state: balloon_state, + states.balloon_device = Some(VirtioDeviceState { + device_id, + device_state, transport_state, - device_info: device_info.clone(), + device_info, }); } // Both virtio-block and vhost-user-block share same device type. @@ -316,16 +267,17 @@ impl<'a> Persist<'a> for MMIODeviceManager { ); } else { block.prepare_save(); - states.block_devices.push(ConnectedBlockState { - device_id: devid.clone(), - device_state: block.save(), + let device_state = block.save(); + states.block_devices.push(VirtioDeviceState { + device_id, + device_state, transport_state, - device_info: device_info.clone(), - }) + device_info, + }); } } TYPE_NET => { - let net = locked_device.as_any().downcast_ref::().unwrap(); + let net = locked_device.as_mut_any().downcast_mut::().unwrap(); if let (Some(mmds_ns), None) = (net.mmds_ns.as_ref(), states.mmds.as_ref()) { let mmds_guard = mmds_ns.mmds.lock().expect("Poisoned lock"); states.mmds = Some(MmdsState { @@ -334,11 +286,13 @@ impl<'a> Persist<'a> for MMIODeviceManager { }); } - states.net_devices.push(ConnectedNetState { - device_id: devid.clone(), - device_state: net.save(), + net.prepare_save(); + let device_state = net.save(); + states.net_devices.push(VirtioDeviceState { + device_id, + device_state, transport_state, - device_info: device_info.clone(), + device_info, }); } TYPE_VSOCK => { @@ -358,16 +312,16 @@ impl<'a> Persist<'a> for MMIODeviceManager { // Save state after potential notification to the guest. This // way we save changes to the queue the notification can cause. 
- let vsock_state = VsockState { + let device_state = VsockState { backend: vsock.backend().save(), frontend: vsock.save(), }; - states.vsock_device = Some(ConnectedVsockState { - device_id: devid.clone(), - device_state: vsock_state, + states.vsock_device = Some(VirtioDeviceState { + device_id, + device_state, transport_state, - device_info: device_info.clone(), + device_info, }); } TYPE_RNG => { @@ -375,12 +329,13 @@ impl<'a> Persist<'a> for MMIODeviceManager { .as_mut_any() .downcast_mut::() .unwrap(); + let device_state = entropy.save(); - states.entropy_device = Some(ConnectedEntropyState { - device_id: devid.clone(), - device_state: entropy.save(), + states.entropy_device = Some(VirtioDeviceState { + device_id, + device_state, transport_state, - device_info: device_info.clone(), + device_info, }); } _ => unreachable!(), @@ -403,54 +358,29 @@ impl<'a> Persist<'a> for MMIODeviceManager { { for state in &state.legacy_devices { if state.type_ == DeviceType::Serial { - let serial = crate::builder::setup_serial_device( - constructor_args.event_manager, - std::io::stdin(), - std::io::stdout(), - )?; - + let serial = Arc::new(Mutex::new(SerialDevice::new( + Some(std::io::stdin()), + SerialOut::Stdout(std::io::stdout()), + )?)); constructor_args - .resource_allocator - .allocate_mmio_memory( - MMIO_LEN, - MMIO_LEN, - AllocPolicy::ExactMatch(state.device_info.addr), - ) - .map_err(|e| { - DevicePersistError::DeviceManager(super::mmio::MmioError::Allocator(e)) - })?; - - dev_manager.register_mmio_serial( - vm, - constructor_args.resource_allocator, - serial, - Some(state.device_info.clone()), - )?; + .event_manager + .add_subscriber(serial.clone()); + + dev_manager.register_mmio_serial(vm, serial, Some(state.device_info))?; } if state.type_ == DeviceType::Rtc { - let rtc = crate::devices::legacy::RTCDevice(vm_superio::Rtc::with_events( - &crate::devices::legacy::rtc_pl031::METRICS, - )); - constructor_args - .resource_allocator - .allocate_mmio_memory( - MMIO_LEN, - MMIO_LEN, - AllocPolicy::ExactMatch(state.device_info.addr), - ) - .map_err(|e| { - DevicePersistError::DeviceManager(super::mmio::MmioError::Allocator(e)) - })?; + let rtc = Arc::new(Mutex::new(RTCDevice::new())); dev_manager.register_mmio_rtc( - constructor_args.resource_allocator, + constructor_args.vm, rtc, - Some(state.device_info.clone()), + Some(state.device_info), )?; } } } let mut restore_helper = |device: Arc>, + activated: bool, is_vhost_user: bool, as_subscriber: Arc>, id: &String, @@ -458,36 +388,33 @@ impl<'a> Persist<'a> for MMIODeviceManager { device_info: &MMIODeviceInfo, event_manager: &mut EventManager| -> Result<(), Self::Error> { + let interrupt = Arc::new(IrqTrigger::new()); let restore_args = MmioTransportConstructorArgs { mem: mem.clone(), - device, + interrupt: interrupt.clone(), + device: device.clone(), is_vhost_user, }; - let mmio_transport = MmioTransport::restore(restore_args, state) - .map_err(|()| DevicePersistError::MmioTransport)?; - - // We do not currently require exact re-allocation of IDs via - // `dev_manager.irq_allocator.allocate_id()` and currently cannot do - // this effectively as `IdAllocator` does not implement an exact - // match API. - // In the future we may require preserving `IdAllocator`'s state - // after snapshot restore so as to restore the exact interrupt IDs - // from the original device's state for implementing hot-plug. - // For now this is why we do not restore the state of the - // `IdAllocator` under `dev_manager`. 
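+            // The transport is rebuilt around a fresh `IrqTrigger`, registered
+            // with the device manager at the exact MMIO `device_info` saved in
+            // the snapshot, and devices that were live at snapshot time are
+            // re-activated so their queues and interrupt wiring come back up.
+            // A condensed view of what the code below does:
+            //
+            //     let transport = MmioTransport::restore(restore_args, state)?;
+            //     dev_manager.register_mmio_virtio(vm, id, MMIODevice { resources, inner })?;
+            //     if activated {
+            //         device.lock().unwrap().activate(mem.clone(), interrupt)?;
+            //     }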
+ let mmio_transport = Arc::new(Mutex::new( + MmioTransport::restore(restore_args, state) + .map_err(|()| DevicePersistError::MmioTransport)?, + )); + + dev_manager.register_mmio_virtio( + vm, + id.clone(), + MMIODevice { + resources: *device_info, + inner: mmio_transport, + }, + )?; - constructor_args - .resource_allocator - .allocate_mmio_memory( - MMIO_LEN, - MMIO_LEN, - AllocPolicy::ExactMatch(device_info.addr), - ) - .map_err(|e| { - DevicePersistError::DeviceManager(super::mmio::MmioError::Allocator(e)) - })?; - - dev_manager.register_mmio_virtio(vm, id.clone(), mmio_transport, device_info)?; + if activated { + device + .lock() + .expect("Poisoned lock") + .activate(mem.clone(), interrupt)?; + } event_manager.add_subscriber(as_subscriber); Ok(()) @@ -508,6 +435,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { restore_helper( device.clone(), + balloon_state.device_state.virtio_state.activated, false, device, &balloon_state.device_id, @@ -529,6 +457,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { restore_helper( device.clone(), + block_state.device_state.is_activated(), false, device, &block_state.device_id, @@ -567,6 +496,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { restore_helper( device.clone(), + net_state.device_state.virtio_state.activated, false, device, &net_state.device_id, @@ -595,6 +525,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { restore_helper( device.clone(), + vsock_state.device_state.frontend.virtio_state.activated, false, device, &vsock_state.device_id, @@ -605,7 +536,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { } if let Some(entropy_state) = &state.entropy_device { - let ctor_args = EntropyConstructorArgs::new(mem.clone()); + let ctor_args = EntropyConstructorArgs { mem: mem.clone() }; let device = Arc::new(Mutex::new(Entropy::restore( ctor_args, @@ -618,6 +549,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { restore_helper( device.clone(), + entropy_state.device_state.virtio_state.activated, false, device, &entropy_state.device_id, @@ -637,6 +569,7 @@ mod tests { use super::*; use crate::builder::tests::*; + use crate::device_manager; use crate::devices::virtio::block::CacheType; use crate::resources::VmmConfig; use crate::snapshot::Snapshot; @@ -645,29 +578,8 @@ mod tests { use crate::vmm_config::net::NetworkInterfaceConfig; use crate::vmm_config::vsock::VsockDeviceConfig; - impl PartialEq for ConnectedBalloonState { - fn eq(&self, other: &ConnectedBalloonState) -> bool { - // Actual device state equality is checked by the device's tests. - self.transport_state == other.transport_state && self.device_info == other.device_info - } - } - - impl PartialEq for ConnectedBlockState { - fn eq(&self, other: &ConnectedBlockState) -> bool { - // Actual device state equality is checked by the device's tests. - self.transport_state == other.transport_state && self.device_info == other.device_info - } - } - - impl PartialEq for ConnectedNetState { - fn eq(&self, other: &ConnectedNetState) -> bool { - // Actual device state equality is checked by the device's tests. - self.transport_state == other.transport_state && self.device_info == other.device_info - } - } - - impl PartialEq for ConnectedVsockState { - fn eq(&self, other: &ConnectedVsockState) -> bool { + impl PartialEq for VirtioDeviceState { + fn eq(&self, other: &VirtioDeviceState) -> bool { // Actual device state equality is checked by the device's tests. 
self.transport_state == other.transport_state && self.device_info == other.device_info } @@ -679,46 +591,42 @@ mod tests { && self.block_devices == other.block_devices && self.net_devices == other.net_devices && self.vsock_device == other.vsock_device + && self.entropy_device == other.entropy_device } } - impl MMIODeviceManager { - fn soft_clone(&self) -> Self { - // We can unwrap here as we create with values directly in scope we - // know will results in `Ok` - let mut clone = MMIODeviceManager::new(); - // We only care about the device hashmap. - clone.id_to_dev_info.clone_from(&self.id_to_dev_info); - clone + impl PartialEq for MMIODevice { + fn eq(&self, other: &Self) -> bool { + self.resources == other.resources } } impl PartialEq for MMIODeviceManager { fn eq(&self, other: &MMIODeviceManager) -> bool { // We only care about the device hashmap. - if self.id_to_dev_info.len() != other.id_to_dev_info.len() { + if self.virtio_devices.len() != other.virtio_devices.len() { return false; } - for (key, val) in &self.id_to_dev_info { - match other.id_to_dev_info.get(key) { + for (key, val) in &self.virtio_devices { + match other.virtio_devices.get(key) { Some(other_val) if val == other_val => continue, _ => return false, - }; + } } - true + + self.boot_timer == other.boot_timer } } #[test] fn test_device_manager_persistence() { - let mut buf = vec![0; 16384]; + let mut buf = vec![0; 65536]; // These need to survive so the restored blocks find them. let _block_files; let mut tmp_sock_file = TempFile::new().unwrap(); - let mut resource_allocator = ResourceAllocator::new().unwrap(); tmp_sock_file.remove().unwrap(); // Set up a vmm with one of each device, and get the serialized DeviceStates. - let original_mmio_device_manager = { + { let mut event_manager = EventManager::new().expect("Unable to create EventManager"); let mut vmm = default_vmm(); let mut cmdline = default_kernel_cmdline(); @@ -768,28 +676,26 @@ mod tests { let entropy_config = EntropyDeviceConfig::default(); insert_entropy_device(&mut vmm, &mut cmdline, &mut event_manager, entropy_config); - Snapshot::serialize(&mut buf.as_mut_slice(), &vmm.mmio_device_manager.save()).unwrap(); + Snapshot::serialize(&mut buf.as_mut_slice(), &vmm.device_manager.save()).unwrap(); + } - // We only want to keep the device map from the original MmioDeviceManager. 
- vmm.mmio_device_manager.soft_clone() - }; tmp_sock_file.remove().unwrap(); let mut event_manager = EventManager::new().expect("Unable to create EventManager"); let vmm = default_vmm(); - let device_states: DeviceStates = Snapshot::deserialize(&mut buf.as_slice()).unwrap(); + let device_manager_state: device_manager::DevicesState = + Snapshot::deserialize(&mut buf.as_slice()).unwrap(); let vm_resources = &mut VmResources::default(); let restore_args = MMIODevManagerConstructorArgs { mem: vmm.vm.guest_memory(), - vm: vmm.vm.fd(), + vm: &vmm.vm, event_manager: &mut event_manager, - resource_allocator: &mut resource_allocator, vm_resources, instance_id: "microvm-id", restored_from_file: true, }; - let restored_dev_manager = - MMIODeviceManager::restore(restore_args, &device_states).unwrap(); + let _restored_dev_manager = + MMIODeviceManager::restore(restore_args, &device_manager_state.mmio_state).unwrap(); let expected_vm_resources = format!( r#"{{ @@ -865,9 +771,10 @@ mod tests { .version(), MmdsVersion::V2 ); - assert_eq!(device_states.mmds.unwrap().version, MmdsVersion::V2); - - assert_eq!(restored_dev_manager, original_mmio_device_manager); + assert_eq!( + device_manager_state.mmio_state.mmds.unwrap().version, + MmdsVersion::V2 + ); assert_eq!( expected_vm_resources, serde_json::to_string_pretty(&VmmConfig::from(&*vm_resources)).unwrap() diff --git a/src/vmm/src/device_manager/resources.rs b/src/vmm/src/device_manager/resources.rs deleted file mode 100644 index 719426a1f55..00000000000 --- a/src/vmm/src/device_manager/resources.rs +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -pub use vm_allocator::AllocPolicy; -use vm_allocator::{AddressAllocator, IdAllocator}; - -use crate::arch; - -/// A resource manager for (de)allocating interrupt lines (GSIs) and guest memory -/// -/// At the moment, we support: -/// -/// * GSIs for legacy x86_64 devices -/// * GSIs for MMIO devicecs -/// * Memory allocations in the MMIO address space -#[derive(Debug)] -pub struct ResourceAllocator { - // Allocator for device interrupt lines - gsi_allocator: IdAllocator, - // Allocator for memory in the MMIO address space - mmio_memory: AddressAllocator, - // Memory allocator for system data - system_memory: AddressAllocator, -} - -impl ResourceAllocator { - /// Create a new resource allocator for Firecracker devices - pub fn new() -> Result { - Ok(Self { - gsi_allocator: IdAllocator::new(arch::GSI_BASE, arch::GSI_MAX)?, - mmio_memory: AddressAllocator::new(arch::MMIO_MEM_START, arch::MMIO_MEM_SIZE)?, - system_memory: AddressAllocator::new(arch::SYSTEM_MEM_START, arch::SYSTEM_MEM_SIZE)?, - }) - } - - /// Allocate a number of GSIs - /// - /// # Arguments - /// - /// * `gsi_count` - The number of GSIs to allocate - pub fn allocate_gsi(&mut self, gsi_count: u32) -> Result, vm_allocator::Error> { - let mut gsis = Vec::with_capacity(gsi_count as usize); - - for _ in 0..gsi_count { - match self.gsi_allocator.allocate_id() { - Ok(gsi) => gsis.push(gsi), - Err(err) => { - // It is ok to unwrap here, we just allocated the GSI - gsis.into_iter().for_each(|gsi| { - self.gsi_allocator.free_id(gsi).unwrap(); - }); - return Err(err); - } - } - } - - Ok(gsis) - } - - /// Allocate a memory range in MMIO address space - /// - /// If it succeeds, it returns the first address of the allocated range - /// - /// # Arguments - /// - /// * `size` - The size in bytes of the memory to allocate - /// * `alignment` - The alignment of the 
address of the first byte - /// * `policy` - A [`vm_allocator::AllocPolicy`] variant for determining the allocation policy - pub fn allocate_mmio_memory( - &mut self, - size: u64, - alignment: u64, - policy: AllocPolicy, - ) -> Result { - Ok(self.mmio_memory.allocate(size, alignment, policy)?.start()) - } - - /// Allocate a memory range for system data - /// - /// If it succeeds, it returns the first address of the allocated range - /// - /// # Arguments - /// - /// * `size` - The size in bytes of the memory to allocate - /// * `alignment` - The alignment of the address of the first byte - /// * `policy` - A [`vm_allocator::AllocPolicy`] variant for determining the allocation policy - pub fn allocate_system_memory( - &mut self, - size: u64, - alignment: u64, - policy: AllocPolicy, - ) -> Result { - Ok(self - .system_memory - .allocate(size, alignment, policy)? - .start()) - } -} - -#[cfg(test)] -mod tests { - use super::ResourceAllocator; - use crate::arch; - - const MAX_IRQS: u32 = arch::GSI_MAX - arch::GSI_BASE + 1; - - #[test] - fn test_allocate_gsi() { - let mut allocator = ResourceAllocator::new().unwrap(); - // asking for 0 IRQs should return us an empty vector - assert_eq!(allocator.allocate_gsi(0), Ok(vec![])); - // We cannot allocate more GSIs than available - assert_eq!( - allocator.allocate_gsi(MAX_IRQS + 1), - Err(vm_allocator::Error::ResourceNotAvailable) - ); - // But allocating all of them at once should work - assert_eq!( - allocator.allocate_gsi(MAX_IRQS), - Ok((arch::GSI_BASE..=arch::GSI_MAX).collect::>()) - ); - // And now we ran out of GSIs - assert_eq!( - allocator.allocate_gsi(1), - Err(vm_allocator::Error::ResourceNotAvailable) - ); - // But we should be able to ask for 0 GSIs - assert_eq!(allocator.allocate_gsi(0), Ok(vec![])); - - let mut allocator = ResourceAllocator::new().unwrap(); - // We should be able to allocate 1 GSI - assert_eq!(allocator.allocate_gsi(1), Ok(vec![arch::GSI_BASE])); - // We can't allocate MAX_IRQS any more - assert_eq!( - allocator.allocate_gsi(MAX_IRQS), - Err(vm_allocator::Error::ResourceNotAvailable) - ); - // We can allocate another one and it should be the second available - assert_eq!(allocator.allocate_gsi(1), Ok(vec![arch::GSI_BASE + 1])); - // Let's allocate the rest in a loop - for i in arch::GSI_BASE + 2..=arch::GSI_MAX { - assert_eq!(allocator.allocate_gsi(1), Ok(vec![i])); - } - } -} diff --git a/src/vmm/src/devices/acpi/vmgenid.rs b/src/vmm/src/devices/acpi/vmgenid.rs index 31dbf64ec39..8dc89289c98 100644 --- a/src/vmm/src/devices/acpi/vmgenid.rs +++ b/src/vmm/src/devices/acpi/vmgenid.rs @@ -11,9 +11,9 @@ use vm_superio::Trigger; use vmm_sys_util::eventfd::EventFd; use super::super::legacy::EventFdTrigger; -use crate::device_manager::resources::ResourceAllocator; use crate::snapshot::Persist; use crate::vstate::memory::{Bytes, GuestMemoryMmap}; +use crate::vstate::resources::ResourceAllocator; /// Bytes of memory we allocate for VMGenID device pub const VMGENID_MEM_SIZE: u64 = 16; @@ -88,7 +88,7 @@ impl VmGenId { mem: &GuestMemoryMmap, resource_allocator: &mut ResourceAllocator, ) -> Result { - let gsi = resource_allocator.allocate_gsi(1)?; + let gsi = resource_allocator.allocate_gsi_legacy(1)?; // The generation ID needs to live in an 8-byte aligned buffer let addr = resource_allocator.allocate_system_memory( VMGENID_MEM_SIZE, @@ -152,11 +152,6 @@ impl<'a> Persist<'a> for VmGenId { constructor_args: Self::ConstructorArgs, state: &Self::State, ) -> std::result::Result { - 
constructor_args.resource_allocator.allocate_system_memory( - VMGENID_MEM_SIZE, - 8, - vm_allocator::AllocPolicy::ExactMatch(state.addr), - )?; Self::from_parts(GuestAddress(state.addr), state.gsi, constructor_args.mem) } } diff --git a/src/vmm/src/devices/bus.rs b/src/vmm/src/devices/bus.rs deleted file mode 100644 index 2b016d73083..00000000000 --- a/src/vmm/src/devices/bus.rs +++ /dev/null @@ -1,404 +0,0 @@ -// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 -// -// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the THIRD-PARTY file. - -//! Handles routing to devices in an address space. - -use std::cmp::{Ord, Ordering, PartialEq, PartialOrd}; -use std::collections::btree_map::BTreeMap; -use std::sync::{Arc, Mutex}; - -/// Errors triggered during bus operations. -#[derive(Debug, thiserror::Error, displaydoc::Display)] -pub enum BusError { - /// New device overlaps with an old device. - Overlap, -} - -#[derive(Debug, Copy, Clone)] -struct BusRange(u64, u64); - -impl Eq for BusRange {} - -impl PartialEq for BusRange { - fn eq(&self, other: &BusRange) -> bool { - self.0 == other.0 - } -} - -impl Ord for BusRange { - fn cmp(&self, other: &BusRange) -> Ordering { - self.0.cmp(&other.0) - } -} - -impl PartialOrd for BusRange { - fn partial_cmp(&self, other: &BusRange) -> Option { - Some(self.cmp(other)) - } -} - -/// A device container for routing reads and writes over some address space. -/// -/// This doesn't have any restrictions on what kind of device or address space this applies to. The -/// only restriction is that no two devices can overlap in this address space. -#[derive(Debug, Clone, Default)] -pub struct Bus { - devices: BTreeMap>>, -} - -use event_manager::{EventOps, Events, MutEventSubscriber}; - -#[cfg(target_arch = "aarch64")] -use super::legacy::RTCDevice; -use super::legacy::{I8042Device, SerialDevice}; -use super::pseudo::BootTimer; -use super::virtio::mmio::MmioTransport; - -#[derive(Debug)] -pub enum BusDevice { - I8042Device(I8042Device), - #[cfg(target_arch = "aarch64")] - RTCDevice(RTCDevice), - BootTimer(BootTimer), - MmioTransport(MmioTransport), - Serial(SerialDevice), - #[cfg(test)] - Dummy(DummyDevice), - #[cfg(test)] - Constant(ConstantDevice), -} - -#[cfg(test)] -#[derive(Debug)] -pub struct DummyDevice; - -#[cfg(test)] -impl DummyDevice { - pub fn bus_write(&mut self, _offset: u64, _data: &[u8]) {} - pub fn bus_read(&mut self, _offset: u64, _data: &[u8]) {} -} - -#[cfg(test)] -#[derive(Debug)] -pub struct ConstantDevice; - -#[cfg(test)] -impl ConstantDevice { - pub fn bus_read(&mut self, offset: u64, data: &mut [u8]) { - for (i, v) in data.iter_mut().enumerate() { - *v = ((offset + i as u64) & 0xff) as u8; - } - } - - fn bus_write(&mut self, offset: u64, data: &[u8]) { - for (i, v) in data.iter().enumerate() { - assert_eq!(*v, ((offset + i as u64) & 0xff) as u8) - } - } -} - -impl BusDevice { - pub fn i8042_device_ref(&self) -> Option<&I8042Device> { - match self { - Self::I8042Device(x) => Some(x), - _ => None, - } - } - #[cfg(target_arch = "aarch64")] - pub fn rtc_device_ref(&self) -> Option<&RTCDevice> { - match self { - Self::RTCDevice(x) => Some(x), - _ => None, - } - } - pub fn boot_timer_ref(&self) -> Option<&BootTimer> { - match self { - Self::BootTimer(x) => Some(x), - _ => None, - } - } - pub fn mmio_transport_ref(&self) -> Option<&MmioTransport> { - match self { - 
Self::MmioTransport(x) => Some(x), - _ => None, - } - } - pub fn serial_ref(&self) -> Option<&SerialDevice> { - match self { - Self::Serial(x) => Some(x), - _ => None, - } - } - - pub fn i8042_device_mut(&mut self) -> Option<&mut I8042Device> { - match self { - Self::I8042Device(x) => Some(x), - _ => None, - } - } - #[cfg(target_arch = "aarch64")] - pub fn rtc_device_mut(&mut self) -> Option<&mut RTCDevice> { - match self { - Self::RTCDevice(x) => Some(x), - _ => None, - } - } - pub fn boot_timer_mut(&mut self) -> Option<&mut BootTimer> { - match self { - Self::BootTimer(x) => Some(x), - _ => None, - } - } - pub fn mmio_transport_mut(&mut self) -> Option<&mut MmioTransport> { - match self { - Self::MmioTransport(x) => Some(x), - _ => None, - } - } - pub fn serial_mut(&mut self) -> Option<&mut SerialDevice> { - match self { - Self::Serial(x) => Some(x), - _ => None, - } - } - - pub fn read(&mut self, offset: u64, data: &mut [u8]) { - match self { - Self::I8042Device(x) => x.bus_read(offset, data), - #[cfg(target_arch = "aarch64")] - Self::RTCDevice(x) => x.bus_read(offset, data), - Self::BootTimer(x) => x.bus_read(offset, data), - Self::MmioTransport(x) => x.bus_read(offset, data), - Self::Serial(x) => x.bus_read(offset, data), - #[cfg(test)] - Self::Dummy(x) => x.bus_read(offset, data), - #[cfg(test)] - Self::Constant(x) => x.bus_read(offset, data), - } - } - - pub fn write(&mut self, offset: u64, data: &[u8]) { - match self { - Self::I8042Device(x) => x.bus_write(offset, data), - #[cfg(target_arch = "aarch64")] - Self::RTCDevice(x) => x.bus_write(offset, data), - Self::BootTimer(x) => x.bus_write(offset, data), - Self::MmioTransport(x) => x.bus_write(offset, data), - Self::Serial(x) => x.bus_write(offset, data), - #[cfg(test)] - Self::Dummy(x) => x.bus_write(offset, data), - #[cfg(test)] - Self::Constant(x) => x.bus_write(offset, data), - } - } -} - -impl MutEventSubscriber for BusDevice { - fn process(&mut self, event: Events, ops: &mut EventOps) { - match self { - Self::Serial(serial) => serial.process(event, ops), - _ => panic!(), - } - } - fn init(&mut self, ops: &mut EventOps) { - match self { - Self::Serial(serial) => serial.init(ops), - _ => panic!(), - } - } -} - -impl Bus { - /// Constructs an a bus with an empty address space. - pub fn new() -> Bus { - Bus { - devices: BTreeMap::new(), - } - } - - fn first_before(&self, addr: u64) -> Option<(BusRange, &Mutex)> { - // for when we switch to rustc 1.17: self.devices.range(..addr).iter().rev().next() - for (range, dev) in self.devices.iter().rev() { - if range.0 <= addr { - return Some((*range, dev)); - } - } - None - } - - /// Returns the device found at some address. - pub fn get_device(&self, addr: u64) -> Option<(u64, &Mutex)> { - if let Some((BusRange(start, len), dev)) = self.first_before(addr) { - let offset = addr - start; - if offset < len { - return Some((offset, dev)); - } - } - None - } - - /// Puts the given device at the given address space. - pub fn insert( - &mut self, - device: Arc>, - base: u64, - len: u64, - ) -> Result<(), BusError> { - if len == 0 { - return Err(BusError::Overlap); - } - - // Reject all cases where the new device's base is within an old device's range. - if self.get_device(base).is_some() { - return Err(BusError::Overlap); - } - - // The above check will miss an overlap in which the new device's base address is before the - // range of another device. To catch that case, we search for a device with a range before - // the new device's range's end. 
If there is no existing device in that range that starts - // after the new device, then there will be no overlap. - if let Some((BusRange(start, _), _)) = self.first_before(base + len - 1) { - // Such a device only conflicts with the new device if it also starts after the new - // device because of our initial `get_device` check above. - if start >= base { - return Err(BusError::Overlap); - } - } - - if self.devices.insert(BusRange(base, len), device).is_some() { - return Err(BusError::Overlap); - } - - Ok(()) - } - - /// Reads data from the device that owns the range containing `addr` and puts it into `data`. - /// - /// Returns true on success, otherwise `data` is untouched. - pub fn read(&self, addr: u64, data: &mut [u8]) -> bool { - if let Some((offset, dev)) = self.get_device(addr) { - // OK to unwrap as lock() failing is a serious error condition and should panic. - dev.lock() - .expect("Failed to acquire device lock") - .read(offset, data); - true - } else { - false - } - } - - /// Writes `data` to the device that owns the range containing `addr`. - /// - /// Returns true on success, otherwise `data` is untouched. - pub fn write(&self, addr: u64, data: &[u8]) -> bool { - if let Some((offset, dev)) = self.get_device(addr) { - // OK to unwrap as lock() failing is a serious error condition and should panic. - dev.lock() - .expect("Failed to acquire device lock") - .write(offset, data); - true - } else { - false - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn bus_insert() { - let mut bus = Bus::new(); - let dummy = Arc::new(Mutex::new(BusDevice::Dummy(DummyDevice))); - // Insert len should not be 0. - bus.insert(dummy.clone(), 0x10, 0).unwrap_err(); - bus.insert(dummy.clone(), 0x10, 0x10).unwrap(); - - let result = bus.insert(dummy.clone(), 0x0f, 0x10); - // This overlaps the address space of the existing bus device at 0x10. - assert!(matches!(result, Err(BusError::Overlap)), "{:?}", result); - - // This overlaps the address space of the existing bus device at 0x10. - bus.insert(dummy.clone(), 0x10, 0x10).unwrap_err(); - // This overlaps the address space of the existing bus device at 0x10. - bus.insert(dummy.clone(), 0x10, 0x15).unwrap_err(); - // This overlaps the address space of the existing bus device at 0x10. - bus.insert(dummy.clone(), 0x12, 0x15).unwrap_err(); - // This overlaps the address space of the existing bus device at 0x10. - bus.insert(dummy.clone(), 0x12, 0x01).unwrap_err(); - // This overlaps the address space of the existing bus device at 0x10. 
- bus.insert(dummy.clone(), 0x0, 0x20).unwrap_err(); - bus.insert(dummy.clone(), 0x20, 0x05).unwrap(); - bus.insert(dummy.clone(), 0x25, 0x05).unwrap(); - bus.insert(dummy, 0x0, 0x10).unwrap(); - } - - #[test] - fn bus_read_write() { - let mut bus = Bus::new(); - let dummy = Arc::new(Mutex::new(BusDevice::Dummy(DummyDevice))); - bus.insert(dummy, 0x10, 0x10).unwrap(); - assert!(bus.read(0x10, &mut [0, 0, 0, 0])); - assert!(bus.write(0x10, &[0, 0, 0, 0])); - assert!(bus.read(0x11, &mut [0, 0, 0, 0])); - assert!(bus.write(0x11, &[0, 0, 0, 0])); - assert!(bus.read(0x16, &mut [0, 0, 0, 0])); - assert!(bus.write(0x16, &[0, 0, 0, 0])); - assert!(!bus.read(0x20, &mut [0, 0, 0, 0])); - assert!(!bus.write(0x20, &[0, 0, 0, 0])); - assert!(!bus.read(0x06, &mut [0, 0, 0, 0])); - assert!(!bus.write(0x06, &[0, 0, 0, 0])); - } - - #[test] - fn bus_read_write_values() { - let mut bus = Bus::new(); - let dummy = Arc::new(Mutex::new(BusDevice::Constant(ConstantDevice))); - bus.insert(dummy, 0x10, 0x10).unwrap(); - - let mut values = [0, 1, 2, 3]; - assert!(bus.read(0x10, &mut values)); - assert_eq!(values, [0, 1, 2, 3]); - assert!(bus.write(0x10, &values)); - assert!(bus.read(0x15, &mut values)); - assert_eq!(values, [5, 6, 7, 8]); - assert!(bus.write(0x15, &values)); - } - - #[test] - fn busrange_cmp_and_clone() { - assert_eq!(BusRange(0x10, 2), BusRange(0x10, 3)); - assert_eq!(BusRange(0x10, 2), BusRange(0x10, 2)); - - assert!(BusRange(0x10, 2) < BusRange(0x12, 1)); - assert!(BusRange(0x10, 2) < BusRange(0x12, 3)); - - let mut bus = Bus::new(); - let mut data = [1, 2, 3, 4]; - bus.insert( - Arc::new(Mutex::new(BusDevice::Dummy(DummyDevice))), - 0x10, - 0x10, - ) - .unwrap(); - assert!(bus.write(0x10, &data)); - let bus_clone = bus.clone(); - assert!(bus.read(0x10, &mut data)); - assert_eq!(data, [1, 2, 3, 4]); - assert!(bus_clone.read(0x10, &mut data)); - assert_eq!(data, [1, 2, 3, 4]); - } - - #[test] - fn test_display_error() { - assert_eq!( - format!("{}", BusError::Overlap), - "New device overlaps with an old device." - ); - } -} diff --git a/src/vmm/src/devices/legacy/i8042.rs b/src/vmm/src/devices/legacy/i8042.rs index bcf7bdd8c90..235ce2a7339 100644 --- a/src/vmm/src/devices/legacy/i8042.rs +++ b/src/vmm/src/devices/legacy/i8042.rs @@ -7,6 +7,7 @@ use std::io; use std::num::Wrapping; +use std::sync::{Arc, Barrier}; use log::warn; use serde::Serialize; @@ -96,7 +97,7 @@ pub struct I8042Device { reset_evt: EventFd, /// Keyboard interrupt event (IRQ 1). - kbd_interrupt_evt: EventFd, + pub kbd_interrupt_evt: EventFd, /// The i8042 status register. status: u8, @@ -118,10 +119,10 @@ pub struct I8042Device { impl I8042Device { /// Constructs an i8042 device that will signal the given event when the guest requests it. - pub fn new(reset_evt: EventFd, kbd_interrupt_evt: EventFd) -> I8042Device { - I8042Device { + pub fn new(reset_evt: EventFd) -> Result { + Ok(I8042Device { reset_evt, - kbd_interrupt_evt, + kbd_interrupt_evt: EventFd::new(libc::EFD_NONBLOCK)?, control: CB_POST_OK | CB_KBD_INT, cmd: 0, outp: 0, @@ -129,7 +130,7 @@ impl I8042Device { buf: [0; BUF_SIZE], bhead: Wrapping(0), btail: Wrapping(0), - } + }) } /// Signal a ctrl-alt-del (reset) event. @@ -209,8 +210,8 @@ impl I8042Device { } } -impl I8042Device { - pub fn bus_read(&mut self, offset: u64, data: &mut [u8]) { +impl vm_device::BusDevice for I8042Device { + fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { // All our ports are byte-wide. We don't know how to handle any wider data. 
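+        // These are now the `vm_device::BusDevice` entry points rather than the
+        // old `bus_read`/`bus_write` helpers: both receive the bus `base`
+        // address alongside the device-relative offset (unused here), and
+        // `write` returns `Option<Arc<Barrier>>` so a device could ask the bus
+        // to synchronize before the access completes; the i8042 never needs
+        // that and always returns `None`. Skeleton of the trait as assumed by
+        // this change:
+        //
+        //     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]);
+        //     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>>;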
if data.len() != 1 { METRICS.missed_read_count.inc(); @@ -245,11 +246,11 @@ impl I8042Device { } } - pub fn bus_write(&mut self, offset: u64, data: &[u8]) { + fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { // All our ports are byte-wide. We don't know how to handle any wider data. if data.len() != 1 { METRICS.missed_write_count.inc(); - return; + return None; } let mut write_ok = true; @@ -335,11 +336,15 @@ impl I8042Device { } else { METRICS.missed_write_count.inc(); } + + None } } #[cfg(test)] mod tests { + use vm_device::BusDevice; + use super::*; impl PartialEq for I8042Error { @@ -350,17 +355,14 @@ mod tests { #[test] fn test_i8042_read_write_and_event() { - let mut i8042 = I8042Device::new( - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - ); + let mut i8042 = I8042Device::new(EventFd::new(libc::EFD_NONBLOCK).unwrap()).unwrap(); let reset_evt = i8042.reset_evt.try_clone().unwrap(); // Check if reading in a 2-length array doesn't have side effects. let mut data = [1, 2]; - i8042.bus_read(0, &mut data); + i8042.read(0x0, 0, &mut data); assert_eq!(data, [1, 2]); - i8042.bus_read(1, &mut data); + i8042.read(0x0, 1, &mut data); assert_eq!(data, [1, 2]); // Check if reset works. @@ -368,72 +370,66 @@ mod tests { // counter doesn't change (for 0 it blocks). reset_evt.write(1).unwrap(); let mut data = [CMD_RESET_CPU]; - i8042.bus_write(OFS_STATUS, &data); + i8042.write(0x0, OFS_STATUS, &data); assert_eq!(reset_evt.read().unwrap(), 2); // Check if reading with offset 1 doesn't have side effects. - i8042.bus_read(1, &mut data); + i8042.read(0x0, 1, &mut data); assert_eq!(data[0], CMD_RESET_CPU); // Check invalid `write`s. let before = METRICS.missed_write_count.count(); // offset != 0. - i8042.bus_write(1, &data); + i8042.write(0x0, 1, &data); // data != CMD_RESET_CPU data[0] = CMD_RESET_CPU + 1; - i8042.bus_write(1, &data); + i8042.write(0x0, 1, &data); // data.len() != 1 let data = [CMD_RESET_CPU; 2]; - i8042.bus_write(1, &data); + i8042.write(0x0, 1, &data); assert_eq!(METRICS.missed_write_count.count(), before + 3); } #[test] fn test_i8042_commands() { - let mut i8042 = I8042Device::new( - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - ); + let mut i8042 = I8042Device::new(EventFd::new(libc::EFD_NONBLOCK).unwrap()).unwrap(); let mut data = [1]; // Test reading/writing the control register. data[0] = CMD_WRITE_CTR; - i8042.bus_write(OFS_STATUS, &data); + i8042.write(0x0, OFS_STATUS, &data); assert_ne!(i8042.status & SB_I8042_CMD_DATA, 0); data[0] = 0x52; - i8042.bus_write(OFS_DATA, &data); + i8042.write(0x0, OFS_DATA, &data); data[0] = CMD_READ_CTR; - i8042.bus_write(OFS_STATUS, &data); + i8042.write(0x0, OFS_STATUS, &data); assert_ne!(i8042.status & SB_OUT_DATA_AVAIL, 0); - i8042.bus_read(OFS_DATA, &mut data); + i8042.read(0x0, OFS_DATA, &mut data); assert_eq!(data[0], 0x52); // Test reading/writing the output port. data[0] = CMD_WRITE_OUTP; - i8042.bus_write(OFS_STATUS, &data); + i8042.write(0x0, OFS_STATUS, &data); assert_ne!(i8042.status & SB_I8042_CMD_DATA, 0); data[0] = 0x52; - i8042.bus_write(OFS_DATA, &data); + i8042.write(0x0, OFS_DATA, &data); data[0] = CMD_READ_OUTP; - i8042.bus_write(OFS_STATUS, &data); + i8042.write(0x0, OFS_STATUS, &data); assert_ne!(i8042.status & SB_OUT_DATA_AVAIL, 0); - i8042.bus_read(OFS_DATA, &mut data); + i8042.read(0x0, OFS_DATA, &mut data); assert_eq!(data[0], 0x52); // Test kbd commands. 
data[0] = 0x52; - i8042.bus_write(OFS_DATA, &data); + i8042.write(0x0, OFS_DATA, &data); assert_ne!(i8042.status & SB_OUT_DATA_AVAIL, 0); - i8042.bus_read(OFS_DATA, &mut data); + i8042.read(0x0, OFS_DATA, &mut data); assert_eq!(data[0], 0xFA); } #[test] fn test_i8042_buffer() { - let mut i8042 = I8042Device::new( - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - ); + let mut i8042 = I8042Device::new(EventFd::new(libc::EFD_NONBLOCK).unwrap()).unwrap(); // Test push/pop. i8042.push_byte(52).unwrap(); @@ -457,10 +453,7 @@ mod tests { #[test] fn test_i8042_kbd() { - let mut i8042 = I8042Device::new( - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - ); + let mut i8042 = I8042Device::new(EventFd::new(libc::EFD_NONBLOCK).unwrap()).unwrap(); fn expect_key(i8042: &mut I8042Device, key: u16) { let mut data = [1]; @@ -470,13 +463,13 @@ mod tests { assert!(i8042.kbd_interrupt_evt.read().unwrap() > 1); // The "data available" flag should be on. - i8042.bus_read(OFS_STATUS, &mut data); + i8042.read(0x0, OFS_STATUS, &mut data); let mut key_byte: u8; if key & 0xFF00 != 0 { // For extended keys, we should be able to read the MSB first. key_byte = ((key & 0xFF00) >> 8) as u8; - i8042.bus_read(OFS_DATA, &mut data); + i8042.read(0x0, OFS_DATA, &mut data); assert_eq!(data[0], key_byte); // And then do the same for the LSB. @@ -485,10 +478,10 @@ mod tests { i8042.trigger_kbd_interrupt().unwrap(); assert!(i8042.kbd_interrupt_evt.read().unwrap() > 1); // The "data available" flag should be on. - i8042.bus_read(OFS_STATUS, &mut data); + i8042.read(0x0, OFS_STATUS, &mut data); } key_byte = (key & 0xFF) as u8; - i8042.bus_read(OFS_DATA, &mut data); + i8042.read(0x0, OFS_DATA, &mut data); assert_eq!(data[0], key_byte); } @@ -530,9 +523,9 @@ mod tests { // Test kbd interrupt disable. let mut data = [1]; data[0] = CMD_WRITE_CTR; - i8042.bus_write(OFS_STATUS, &data); + i8042.write(0x0, OFS_STATUS, &data); data[0] = i8042.control & !CB_KBD_INT; - i8042.bus_write(OFS_DATA, &data); + i8042.write(0x0, OFS_DATA, &data); i8042.trigger_key(KEY_CTRL).unwrap(); assert_eq!( i8042.trigger_kbd_interrupt().unwrap_err(), diff --git a/src/vmm/src/devices/legacy/rtc_pl031.rs b/src/vmm/src/devices/legacy/rtc_pl031.rs index 754899a23a4..b025c1d1512 100644 --- a/src/vmm/src/devices/legacy/rtc_pl031.rs +++ b/src/vmm/src/devices/legacy/rtc_pl031.rs @@ -4,6 +4,7 @@ use std::convert::TryInto; use serde::Serialize; +use vm_superio::Rtc; use vm_superio::rtc_pl031::RtcEvents; use crate::logger::{IncMetric, SharedIncMetric, warn}; @@ -59,7 +60,19 @@ pub static METRICS: RTCDeviceMetrics = RTCDeviceMetrics::new(); /// Wrapper over vm_superio's RTC implementation. 
#[derive(Debug)] -pub struct RTCDevice(pub vm_superio::Rtc<&'static RTCDeviceMetrics>); +pub struct RTCDevice(vm_superio::Rtc<&'static RTCDeviceMetrics>); + +impl Default for RTCDevice { + fn default() -> Self { + RTCDevice(Rtc::with_events(&METRICS)) + } +} + +impl RTCDevice { + pub fn new() -> RTCDevice { + Default::default() + } +} impl std::ops::Deref for RTCDevice { type Target = vm_superio::Rtc<&'static RTCDeviceMetrics>; @@ -80,7 +93,7 @@ impl RTCDevice { pub fn bus_read(&mut self, offset: u64, data: &mut [u8]) { if let (Ok(offset), 4) = (u16::try_from(offset), data.len()) { // read() function from RTC implementation expects a slice of - // len 4, and we just validated that this is the data lengt + // len 4, and we just validated that this is the data length self.read(offset, data.try_into().unwrap()) } else { warn!( @@ -108,6 +121,23 @@ impl RTCDevice { } } +#[cfg(target_arch = "aarch64")] +impl vm_device::BusDevice for RTCDevice { + fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + self.bus_read(offset, data) + } + + fn write( + &mut self, + _base: u64, + offset: u64, + data: &[u8], + ) -> Option> { + self.bus_write(offset, data); + None + } +} + #[cfg(test)] mod tests { use vm_superio::Rtc; diff --git a/src/vmm/src/devices/legacy/serial.rs b/src/vmm/src/devices/legacy/serial.rs index 278c15a4464..afc47189c1e 100644 --- a/src/vmm/src/devices/legacy/serial.rs +++ b/src/vmm/src/devices/legacy/serial.rs @@ -7,16 +7,18 @@ //! Implements a wrapper over an UART serial device. use std::fmt::Debug; -use std::io; -use std::io::{Read, Write}; +use std::io::{self, Read, Stdin, Write}; use std::os::unix::io::{AsRawFd, RawFd}; +use std::sync::{Arc, Barrier}; use event_manager::{EventOps, Events, MutEventSubscriber}; +use libc::EFD_NONBLOCK; use log::{error, warn}; use serde::Serialize; use vm_superio::serial::{Error as SerialError, SerialEvents}; use vm_superio::{Serial, Trigger}; use vmm_sys_util::epoll::EventSet; +use vmm_sys_util::eventfd::EventFd; use crate::devices::legacy::EventFdTrigger; use crate::logger::{IncMetric, SharedIncMetric}; @@ -220,7 +222,27 @@ impl SerialWrapper = SerialWrapper; +pub type SerialDevice = SerialWrapper; + +impl SerialDevice { + pub fn new(serial_in: Option, serial_out: SerialOut) -> Result { + let interrupt_evt = EventFdTrigger::new(EventFd::new(EFD_NONBLOCK)?); + let buffer_read_event_fd = EventFdTrigger::new(EventFd::new(EFD_NONBLOCK)?); + + let serial = Serial::with_events( + interrupt_evt, + SerialEventsWrapper { + buffer_ready_event_fd: Some(buffer_read_event_fd), + }, + serial_out, + ); + + Ok(SerialDevice { + serial, + input: serial_in, + }) + } +} impl MutEventSubscriber for SerialWrapper @@ -337,10 +359,11 @@ fn is_fifo(fd: RawFd) -> bool { (stat.st_mode & libc::S_IFIFO) != 0 } -impl - SerialWrapper +impl vm_device::BusDevice for SerialWrapper +where + I: Read + AsRawFd + Send, { - pub fn bus_read(&mut self, offset: u64, data: &mut [u8]) { + fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { if let (Ok(offset), 1) = (u8::try_from(offset), data.len()) { data[0] = self.serial.read(offset); } else { @@ -348,7 +371,7 @@ impl } } - pub fn bus_write(&mut self, offset: u64, data: &[u8]) { + fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { if let (Ok(offset), 1) = (u8::try_from(offset), data.len()) { if let Err(err) = self.serial.write(offset, data[0]) { // Counter incremented for any handle_write() error. 
@@ -358,6 +381,7 @@ impl } else { METRICS.missed_write_count.inc(); } + None } } @@ -365,6 +389,7 @@ impl mod tests { #![allow(clippy::undocumented_unsafe_blocks)] + use vm_device::BusDevice; use vmm_sys_util::eventfd::EventFd; use super::*; @@ -390,13 +415,13 @@ mod tests { let invalid_reads_before = metrics.missed_read_count.count(); let mut v = [0x00; 2]; - serial.bus_read(0u64, &mut v); + serial.read(0x0, 0u64, &mut v); let invalid_reads_after = metrics.missed_read_count.count(); assert_eq!(invalid_reads_before + 1, invalid_reads_after); let mut v = [0x00; 1]; - serial.bus_read(0u64, &mut v); + serial.read(0x0, 0u64, &mut v); assert_eq!(v[0], b'a'); let invalid_reads_after_2 = metrics.missed_read_count.count(); diff --git a/src/vmm/src/devices/mod.rs b/src/vmm/src/devices/mod.rs index 495e1507edd..371cc2cfa9e 100644 --- a/src/vmm/src/devices/mod.rs +++ b/src/vmm/src/devices/mod.rs @@ -7,15 +7,16 @@ //! Emulates virtual and hardware devices. +#![allow(unused)] + use std::io; pub mod acpi; -pub mod bus; pub mod legacy; +pub mod pci; pub mod pseudo; pub mod virtio; -pub use bus::{Bus, BusDevice, BusError}; use log::error; use crate::devices::virtio::net::metrics::NetDeviceMetrics; diff --git a/src/vmm/src/devices/pci/mod.rs b/src/vmm/src/devices/pci/mod.rs new file mode 100644 index 00000000000..e365b481893 --- /dev/null +++ b/src/vmm/src/devices/pci/mod.rs @@ -0,0 +1,6 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +pub mod pci_segment; + +pub use pci_segment::*; diff --git a/src/vmm/src/devices/pci/pci_segment.rs b/src/vmm/src/devices/pci/pci_segment.rs new file mode 100644 index 00000000000..7deaa027f7b --- /dev/null +++ b/src/vmm/src/devices/pci/pci_segment.rs @@ -0,0 +1,555 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// Copyright © 2019 - 2021 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause +// + +use std::sync::{Arc, Mutex}; + +#[cfg(target_arch = "x86_64")] +use acpi_tables::{Aml, aml}; +use log::info; +#[cfg(target_arch = "x86_64")] +use pci::{PCI_CONFIG_IO_PORT, PCI_CONFIG_IO_PORT_SIZE, PciConfigIo}; +use pci::{PciBdf, PciBus, PciConfigMmio, PciRoot, PciRootError}; +use uuid::Uuid; +use vm_allocator::AddressAllocator; +use vm_device::{BusDeviceSync, BusError}; + +use crate::arch::{ArchVm as Vm, PCI_MMCONFIG_START, PCI_MMIO_CONFIG_SIZE_PER_SEGMENT}; +use crate::vstate::resources::ResourceAllocator; + +pub struct PciSegment { + pub(crate) id: u16, + pub(crate) pci_bus: Arc>, + pub(crate) pci_config_mmio: Arc>, + pub(crate) mmio_config_address: u64, + pub(crate) proximity_domain: u32, + + #[cfg(target_arch = "x86_64")] + pub(crate) pci_config_io: Option>>, + + // Bitmap of PCI devices to hotplug. + pub(crate) pci_devices_up: u32, + // Bitmap of PCI devices to hotunplug. + pub(crate) pci_devices_down: u32, + // List of allocated IRQs for each PCI slot. 
+ pub(crate) pci_irq_slots: [u8; 32], + + // Device memory covered by this segment + pub(crate) start_of_mem32_area: u64, + pub(crate) end_of_mem32_area: u64, + + pub(crate) start_of_mem64_area: u64, + pub(crate) end_of_mem64_area: u64, +} + +impl std::fmt::Debug for PciSegment { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PciSegment") + .field("id", &self.id) + .field("mmio_config_address", &self.mmio_config_address) + .field("proximity_domain", &self.proximity_domain) + .field("pci_devices_up", &self.pci_devices_up) + .field("pci_devices_down", &self.pci_devices_down) + .field("pci_irq_slots", &self.pci_irq_slots) + .field("start_of_mem32_area", &self.start_of_mem32_area) + .field("end_of_mem32_area", &self.end_of_mem32_area) + .field("start_of_mem64_area", &self.start_of_mem64_area) + .field("end_of_mem64_area", &self.end_of_mem64_area) + .finish() + } +} + +impl PciSegment { + fn build(id: u16, vm: &Arc, pci_irq_slots: &[u8; 32]) -> Result { + let pci_root = PciRoot::new(None); + let pci_bus = Arc::new(Mutex::new(PciBus::new(pci_root, vm.clone()))); + + let pci_config_mmio = Arc::new(Mutex::new(PciConfigMmio::new(Arc::clone(&pci_bus)))); + let mmio_config_address = PCI_MMCONFIG_START + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT * id as u64; + + vm.common.mmio_bus.insert( + Arc::clone(&pci_config_mmio) as Arc, + mmio_config_address, + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, + )?; + + let resource_allocator = vm.resource_allocator(); + + let start_of_mem32_area = resource_allocator.mmio32_memory.base(); + let end_of_mem32_area = resource_allocator.mmio32_memory.end(); + + let start_of_mem64_area = resource_allocator.mmio64_memory.base(); + let end_of_mem64_area = resource_allocator.mmio64_memory.end(); + + let segment = PciSegment { + id, + pci_bus, + pci_config_mmio, + mmio_config_address, + proximity_domain: 0, + pci_devices_up: 0, + pci_devices_down: 0, + #[cfg(target_arch = "x86_64")] + pci_config_io: None, + start_of_mem32_area, + end_of_mem32_area, + start_of_mem64_area, + end_of_mem64_area, + pci_irq_slots: *pci_irq_slots, + }; + + Ok(segment) + } + + #[cfg(target_arch = "x86_64")] + pub(crate) fn new( + id: u16, + vm: &Arc, + pci_irq_slots: &[u8; 32], + ) -> Result { + use crate::Vm; + + let mut segment = Self::build(id, vm, pci_irq_slots)?; + let pci_config_io = Arc::new(Mutex::new(PciConfigIo::new(Arc::clone(&segment.pci_bus)))); + + vm.pio_bus.insert( + pci_config_io.clone(), + PCI_CONFIG_IO_PORT, + PCI_CONFIG_IO_PORT_SIZE, + )?; + + segment.pci_config_io = Some(pci_config_io); + + info!( + "pci: adding PCI segment: id={:#x}, PCI MMIO config address: {:#x}, mem32 area: \ + [{:#x}-{:#x}], mem64 area: [{:#x}-{:#x}] IO area: [{PCI_CONFIG_IO_PORT:#x}-{:#x}]", + segment.id, + segment.mmio_config_address, + segment.start_of_mem32_area, + segment.end_of_mem32_area, + segment.start_of_mem64_area, + segment.end_of_mem64_area, + PCI_CONFIG_IO_PORT + PCI_CONFIG_IO_PORT_SIZE - 1 + ); + + Ok(segment) + } + + #[cfg(target_arch = "aarch64")] + pub(crate) fn new( + id: u16, + vm: &Arc, + pci_irq_slots: &[u8; 32], + ) -> Result { + let segment = Self::build(id, vm, pci_irq_slots)?; + info!( + "pci: adding PCI segment: id={:#x}, PCI MMIO config address: {:#x}, mem32 area: \ + [{:#x}-{:#x}], mem64 area: [{:#x}-{:#x}]", + segment.id, + segment.mmio_config_address, + segment.start_of_mem32_area, + segment.end_of_mem32_area, + segment.start_of_mem64_area, + segment.end_of_mem64_area, + ); + + Ok(segment) + } + + pub(crate) fn next_device_bdf(&self) -> Result { + 
Ok(PciBdf::new( + self.id, + 0, + self.pci_bus + .lock() + .unwrap() + .next_device_id()? + .try_into() + .unwrap(), + 0, + )) + } +} + +#[cfg(target_arch = "x86_64")] +struct PciDevSlot { + device_id: u8, +} + +#[cfg(target_arch = "x86_64")] +impl Aml for PciDevSlot { + fn append_aml_bytes(&self, v: &mut Vec) -> Result<(), aml::AmlError> { + let sun = self.device_id; + let adr: u32 = (self.device_id as u32) << 16; + aml::Device::new( + format!("S{:03}", self.device_id).as_str().try_into()?, + vec![ + &aml::Name::new("_SUN".try_into()?, &sun)?, + &aml::Name::new("_ADR".try_into()?, &adr)?, + &aml::Method::new( + "_EJ0".try_into()?, + 1, + true, + vec![&aml::MethodCall::new( + "\\_SB_.PHPR.PCEJ".try_into()?, + vec![&aml::Path::new("_SUN")?, &aml::Path::new("_SEG")?], + )], + ), + ], + ) + .append_aml_bytes(v) + } +} + +#[cfg(target_arch = "x86_64")] +struct PciDevSlotNotify { + device_id: u8, +} + +#[cfg(target_arch = "x86_64")] +impl Aml for PciDevSlotNotify { + fn append_aml_bytes(&self, v: &mut Vec) -> Result<(), aml::AmlError> { + let device_id_mask: u32 = 1 << self.device_id; + let object = aml::Path::new(&format!("S{:03}", self.device_id))?; + aml::And::new(&aml::Local(0), &aml::Arg(0), &device_id_mask).append_aml_bytes(v)?; + aml::If::new( + &aml::Equal::new(&aml::Local(0), &device_id_mask), + vec![&aml::Notify::new(&object, &aml::Arg(1))], + ) + .append_aml_bytes(v) + } +} + +#[cfg(target_arch = "x86_64")] +struct PciDevSlotMethods {} + +#[cfg(target_arch = "x86_64")] +impl Aml for PciDevSlotMethods { + fn append_aml_bytes(&self, v: &mut Vec) -> Result<(), aml::AmlError> { + let mut device_notifies = Vec::new(); + for device_id in 0..32 { + device_notifies.push(PciDevSlotNotify { device_id }); + } + + let mut device_notifies_refs: Vec<&dyn Aml> = Vec::new(); + for device_notify in device_notifies.iter() { + device_notifies_refs.push(device_notify); + } + + aml::Method::new("DVNT".try_into()?, 2, true, device_notifies_refs).append_aml_bytes(v)?; + aml::Method::new( + "PCNT".try_into()?, + 0, + true, + vec![ + &aml::Acquire::new("\\_SB_.PHPR.BLCK".try_into()?, 0xffff), + &aml::Store::new( + &aml::Path::new("\\_SB_.PHPR.PSEG")?, + &aml::Path::new("_SEG")?, + ), + &aml::MethodCall::new( + "DVNT".try_into()?, + vec![&aml::Path::new("\\_SB_.PHPR.PCIU")?, &aml::ONE], + ), + &aml::MethodCall::new( + "DVNT".try_into()?, + vec![&aml::Path::new("\\_SB_.PHPR.PCID")?, &3usize], + ), + &aml::Release::new("\\_SB_.PHPR.BLCK".try_into()?), + ], + ) + .append_aml_bytes(v) + } +} + +#[cfg(target_arch = "x86_64")] +struct PciDsmMethod {} + +#[cfg(target_arch = "x86_64")] +impl Aml for PciDsmMethod { + fn append_aml_bytes(&self, v: &mut Vec) -> Result<(), aml::AmlError> { + // Refer to ACPI spec v6.3 Ch 9.1.1 and PCI Firmware spec v3.3 Ch 4.6.1 + // _DSM (Device Specific Method), the following is the implementation in ASL. + + // Method (_DSM, 4, NotSerialized) // _DSM: Device-Specific Method + // { + // If ((Arg0 == ToUUID ("e5c937d0-3553-4d7a-9117-ea4d19c3434d") /* Device Labeling + // Interface */)) { + // If ((Arg2 == Zero)) + // { + // Return (Buffer (One) { 0x21 }) + // } + // If ((Arg2 == 0x05)) + // { + // Return (Zero) + // } + // } + // + // Return (Buffer (One) { 0x00 }) + // } + // + // As per ACPI v6.3 Ch 19.6.142, the UUID is required to be in mixed endian: + // Among the fields of a UUID: + // {d1 (8 digits)} - {d2 (4 digits)} - {d3 (4 digits)} - {d4 (16 digits)} + // d1 ~ d3 need to be little endian, d4 be big endian. 
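As a quick sanity check of the mixed-endian rule spelled out above, this is what the 16-byte buffer works out to for e5c937d0-3553-4d7a-9117-ea4d19c3434d; the sketch hard-codes the UUID fields instead of pulling in the `uuid` crate:

```rust
// d1..d3 are stored little-endian, d4 is kept in its big-endian (as-written) order.
fn main() {
    let d1: u32 = 0xE5C937D0;
    let d2: u16 = 0x3553;
    let d3: u16 = 0x4D7A;
    let d4: [u8; 8] = [0x91, 0x17, 0xEA, 0x4D, 0x19, 0xC3, 0x43, 0x4D];

    let mut buf: Vec<u8> = Vec::with_capacity(16);
    buf.extend(d1.to_le_bytes()); // D0 37 C9 E5
    buf.extend(d2.to_le_bytes()); // 53 35
    buf.extend(d3.to_le_bytes()); // 7A 4D
    buf.extend(d4);               // 91 17 EA 4D 19 C3 43 4D (unchanged)

    assert_eq!(
        buf,
        [
            0xD0, 0x37, 0xC9, 0xE5, 0x53, 0x35, 0x7A, 0x4D,
            0x91, 0x17, 0xEA, 0x4D, 0x19, 0xC3, 0x43, 0x4D
        ]
    );
}
```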
+ // See https://en.wikipedia.org/wiki/Universally_unique_identifier#Encoding . + let uuid = Uuid::parse_str("E5C937D0-3553-4D7A-9117-EA4D19C3434D").unwrap(); + let (uuid_d1, uuid_d2, uuid_d3, uuid_d4) = uuid.as_fields(); + let mut uuid_buf = vec![]; + uuid_buf.extend(uuid_d1.to_le_bytes()); + uuid_buf.extend(uuid_d2.to_le_bytes()); + uuid_buf.extend(uuid_d3.to_le_bytes()); + uuid_buf.extend(uuid_d4); + aml::Method::new( + "_DSM".try_into()?, + 4, + false, + vec![ + &aml::If::new( + &aml::Equal::new(&aml::Arg(0), &aml::Buffer::new(uuid_buf)), + vec![ + &aml::If::new( + &aml::Equal::new(&aml::Arg(2), &aml::ZERO), + vec![&aml::Return::new(&aml::Buffer::new(vec![0x21]))], + ), + &aml::If::new( + &aml::Equal::new(&aml::Arg(2), &0x05u8), + vec![&aml::Return::new(&aml::ZERO)], + ), + ], + ), + &aml::Return::new(&aml::Buffer::new(vec![0])), + ], + ) + .append_aml_bytes(v) + } +} + +#[cfg(target_arch = "x86_64")] +impl Aml for PciSegment { + fn append_aml_bytes(&self, v: &mut Vec) -> Result<(), aml::AmlError> { + let mut pci_dsdt_inner_data: Vec<&dyn Aml> = Vec::new(); + let hid = aml::Name::new("_HID".try_into()?, &aml::EisaName::new("PNP0A08")?)?; + pci_dsdt_inner_data.push(&hid); + let cid = aml::Name::new("_CID".try_into()?, &aml::EisaName::new("PNP0A03")?)?; + pci_dsdt_inner_data.push(&cid); + let adr = aml::Name::new("_ADR".try_into()?, &aml::ZERO)?; + pci_dsdt_inner_data.push(&adr); + let seg = aml::Name::new("_SEG".try_into()?, &self.id)?; + pci_dsdt_inner_data.push(&seg); + let uid = aml::Name::new("_UID".try_into()?, &aml::ZERO)?; + pci_dsdt_inner_data.push(&uid); + let cca = aml::Name::new("_CCA".try_into()?, &aml::ONE)?; + pci_dsdt_inner_data.push(&cca); + let supp = aml::Name::new("SUPP".try_into()?, &aml::ZERO)?; + pci_dsdt_inner_data.push(&supp); + + let proximity_domain = self.proximity_domain; + let pxm_return = aml::Return::new(&proximity_domain); + let pxm = aml::Method::new("_PXM".try_into()?, 0, false, vec![&pxm_return]); + pci_dsdt_inner_data.push(&pxm); + + let pci_dsm = PciDsmMethod {}; + pci_dsdt_inner_data.push(&pci_dsm); + + #[allow(clippy::if_same_then_else)] + let crs = if self.id == 0 { + aml::Name::new( + "_CRS".try_into()?, + &aml::ResourceTemplate::new(vec![ + &aml::AddressSpace::new_bus_number(0x0u16, 0x0u16)?, + &aml::Io::new(0xcf8, 0xcf8, 1, 0x8), + &aml::Memory32Fixed::new( + true, + self.mmio_config_address.try_into().unwrap(), + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT.try_into().unwrap(), + ), + &aml::AddressSpace::new_memory( + aml::AddressSpaceCacheable::NotCacheable, + true, + self.start_of_mem32_area, + self.end_of_mem32_area, + )?, + &aml::AddressSpace::new_memory( + aml::AddressSpaceCacheable::NotCacheable, + true, + self.start_of_mem64_area, + self.end_of_mem64_area, + )?, + &aml::AddressSpace::new_io(0u16, 0x0cf7u16)?, + &aml::AddressSpace::new_io(0x0d00u16, 0xffffu16)?, + ]), + )? + } else { + aml::Name::new( + "_CRS".try_into()?, + &aml::ResourceTemplate::new(vec![ + &aml::AddressSpace::new_bus_number(0x0u16, 0x0u16)?, + &aml::Memory32Fixed::new( + true, + self.mmio_config_address.try_into().unwrap(), + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT.try_into().unwrap(), + ), + &aml::AddressSpace::new_memory( + aml::AddressSpaceCacheable::NotCacheable, + true, + self.start_of_mem32_area, + self.end_of_mem32_area, + )?, + &aml::AddressSpace::new_memory( + aml::AddressSpaceCacheable::NotCacheable, + true, + self.start_of_mem64_area, + self.end_of_mem64_area, + )?, + ]), + )? 
+ }; + pci_dsdt_inner_data.push(&crs); + + let mut pci_devices = Vec::new(); + for device_id in 0..32 { + let pci_device = PciDevSlot { device_id }; + pci_devices.push(pci_device); + } + for pci_device in pci_devices.iter() { + pci_dsdt_inner_data.push(pci_device); + } + + let pci_device_methods = PciDevSlotMethods {}; + pci_dsdt_inner_data.push(&pci_device_methods); + + // Build PCI routing table, listing IRQs assigned to PCI devices. + let prt_package_list: Vec<(u32, u32)> = self + .pci_irq_slots + .iter() + .enumerate() + .map(|(i, irq)| { + ( + ((((u32::try_from(i).unwrap()) & 0x1fu32) << 16) | 0xffffu32), + *irq as u32, + ) + }) + .collect(); + let prt_package_list: Vec = prt_package_list + .iter() + .map(|(bdf, irq)| aml::Package::new(vec![bdf, &0u8, &0u8, irq])) + .collect(); + let prt_package_list: Vec<&dyn Aml> = prt_package_list + .iter() + .map(|item| item as &dyn Aml) + .collect(); + let prt = aml::Name::new("_PRT".try_into()?, &aml::Package::new(prt_package_list))?; + pci_dsdt_inner_data.push(&prt); + + aml::Device::new( + format!("_SB_.PC{:02X}", self.id).as_str().try_into()?, + pci_dsdt_inner_data, + ) + .append_aml_bytes(v) + } +} + +#[cfg(test)] +mod tests { + + use super::*; + use crate::arch; + use crate::builder::tests::default_vmm; + use crate::utils::u64_to_usize; + + #[test] + fn test_pci_segment_build() { + let vmm = default_vmm(); + let pci_irq_slots = &[0u8; 32]; + let pci_segment = PciSegment::new(0, &vmm.vm, pci_irq_slots).unwrap(); + + assert_eq!(pci_segment.id, 0); + assert_eq!( + pci_segment.start_of_mem32_area, + arch::MEM_32BIT_DEVICES_START + ); + assert_eq!( + pci_segment.end_of_mem32_area, + arch::MEM_32BIT_DEVICES_START + arch::MEM_32BIT_DEVICES_SIZE - 1 + ); + assert_eq!( + pci_segment.start_of_mem64_area, + arch::MEM_64BIT_DEVICES_START + ); + assert_eq!( + pci_segment.end_of_mem64_area, + arch::MEM_64BIT_DEVICES_START + arch::MEM_64BIT_DEVICES_SIZE - 1 + ); + assert_eq!(pci_segment.mmio_config_address, arch::PCI_MMCONFIG_START); + assert_eq!(pci_segment.proximity_domain, 0); + assert_eq!(pci_segment.pci_devices_up, 0); + assert_eq!(pci_segment.pci_devices_down, 0); + assert_eq!(pci_segment.pci_irq_slots, [0u8; 32]); + } + + #[cfg(target_arch = "x86_64")] + #[test] + fn test_io_bus() { + let vmm = default_vmm(); + let pci_irq_slots = &[0u8; 32]; + let pci_segment = PciSegment::new(0, &vmm.vm, pci_irq_slots).unwrap(); + + let mut data = [0u8; u64_to_usize(PCI_CONFIG_IO_PORT_SIZE)]; + vmm.vm.pio_bus.read(PCI_CONFIG_IO_PORT, &mut data).unwrap(); + + vmm.vm + .pio_bus + .read(PCI_CONFIG_IO_PORT + PCI_CONFIG_IO_PORT_SIZE, &mut data) + .unwrap_err(); + } + + #[test] + fn test_mmio_bus() { + let vmm = default_vmm(); + let pci_irq_slots = &[0u8; 32]; + let pci_segment = PciSegment::new(0, &vmm.vm, pci_irq_slots).unwrap(); + + let mut data = [0u8; u64_to_usize(PCI_MMIO_CONFIG_SIZE_PER_SEGMENT)]; + + vmm.vm + .common + .mmio_bus + .read(pci_segment.mmio_config_address, &mut data) + .unwrap(); + vmm.vm + .common + .mmio_bus + .read( + pci_segment.mmio_config_address + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, + &mut data, + ) + .unwrap_err(); + } + + #[test] + fn test_next_device_bdf() { + let vmm = default_vmm(); + let pci_irq_slots = &[0u8; 32]; + let pci_segment = PciSegment::new(0, &vmm.vm, pci_irq_slots).unwrap(); + + // Start checking from device id 1, since 0 is allocated to the Root port. + for dev_id in 1..32 { + let bdf = pci_segment.next_device_bdf().unwrap(); + // In our case we have a single Segment with id 0, which has + // a single bus with id 0. 
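Both the `_PRT` package built above and the `next_device_bdf` test rely on the standard bus/device/function split. A standalone illustration of the two encodings involved; `prt_address` and `bdf` are hypothetical helpers written for this sketch, not functions from the vendored `pci` crate:

```rust
// PCI addressing conventions: device numbers are 5 bits (0..=31), functions
// 3 bits (0..=7). The _PRT entries above place the device number in bits
// 16..=20 of the address and use 0xFFFF for "any function".
fn prt_address(device: u32) -> u32 {
    ((device & 0x1f) << 16) | 0xffff
}

fn bdf(bus: u8, device: u8, function: u8) -> u16 {
    // Classic 16-bit bus/device/function packing: bbbbbbbb dddddfff.
    ((bus as u16) << 8) | (((device as u16) & 0x1f) << 3) | ((function as u16) & 0x7)
}

fn main() {
    // Device 1, function 0 on bus 0: the first BDF handed out after the root port.
    assert_eq!(bdf(0, 1, 0), 0x0008);
    // _PRT entry for slot 1: device number in the upper word, any function below.
    assert_eq!(prt_address(1), 0x0001_ffff);
}
```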
Also, each device of ours has a + // single function. + assert_eq!(bdf, PciBdf::new(0, 0, dev_id, 0)); + } + + // We can only have 32 devices on a segment + pci_segment.next_device_bdf().unwrap_err(); + } +} diff --git a/src/vmm/src/devices/pseudo/boot_timer.rs b/src/vmm/src/devices/pseudo/boot_timer.rs index ba16e92355f..f0cf38977b5 100644 --- a/src/vmm/src/devices/pseudo/boot_timer.rs +++ b/src/vmm/src/devices/pseudo/boot_timer.rs @@ -1,6 +1,8 @@ // Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 +use std::sync::{Arc, Barrier}; + use utils::time::TimestampUs; use crate::logger::info; @@ -8,16 +10,16 @@ use crate::logger::info; const MAGIC_VALUE_SIGNAL_GUEST_BOOT_COMPLETE: u8 = 123; /// Pseudo device to record the kernel boot time. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct BootTimer { start_ts: TimestampUs, } -impl BootTimer { - pub fn bus_write(&mut self, offset: u64, data: &[u8]) { +impl vm_device::BusDevice for BootTimer { + fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { // Only handle byte length instructions at a zero offset. if data.len() != 1 || offset != 0 { - return; + return None; } if data[0] == MAGIC_VALUE_SIGNAL_GUEST_BOOT_COMPLETE { @@ -33,8 +35,11 @@ impl BootTimer { boot_time_cpu_us / 1000 ); } + + None } - pub fn bus_read(&mut self, _offset: u64, _data: &[u8]) {} + + fn read(&mut self, _base: u64, _offset: u64, _data: &mut [u8]) {} } impl BootTimer { diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs index 8962c992cf8..4586592182c 100644 --- a/src/vmm/src/devices/virtio/balloon/device.rs +++ b/src/vmm/src/devices/virtio/balloon/device.rs @@ -1,9 +1,11 @@ // Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 +use std::ops::Deref; +use std::sync::Arc; use std::time::Duration; -use log::error; +use log::{error, info}; use serde::Serialize; use timerfd::{ClockId, SetTimeFlags, TimerFd, TimerState}; use vmm_sys_util::eventfd::EventFd; @@ -23,9 +25,10 @@ use super::{ VIRTIO_BALLOON_S_SWAP_OUT, }; use crate::devices::virtio::balloon::BalloonError; -use crate::devices::virtio::device::{IrqTrigger, IrqType}; +use crate::devices::virtio::device::ActiveState; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::queue::InvalidAvailIdx; +use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::logger::IncMetric; use crate::utils::u64_to_usize; use crate::vstate::memory::{Address, ByteValued, Bytes, GuestAddress, GuestMemoryMmap}; @@ -162,7 +165,6 @@ pub struct Balloon { pub(crate) queues: Vec, pub(crate) queue_evts: [EventFd; BALLOON_NUM_QUEUES], pub(crate) device_state: DeviceState, - pub(crate) irq_trigger: IrqTrigger, // Implementation specific fields. pub(crate) restored_from_file: bool, @@ -220,7 +222,6 @@ impl Balloon { }, queue_evts, queues, - irq_trigger: IrqTrigger::new().map_err(BalloonError::EventFd)?, device_state: DeviceState::Inactive, activate_evt: EventFd::new(libc::EFD_NONBLOCK).map_err(BalloonError::EventFd)?, restored_from_file, @@ -260,7 +261,7 @@ impl Balloon { pub(crate) fn process_inflate(&mut self) -> Result<(), BalloonError> { // This is safe since we checked in the event handler that the device is activated. 
- let mem = self.device_state.mem().unwrap(); + let mem = &self.device_state.active_state().unwrap().mem; METRICS.inflate_count.inc(); let queue = &mut self.queues[INFLATE_INDEX]; @@ -342,7 +343,7 @@ impl Balloon { queue.advance_used_ring_idx(); if needs_interrupt { - self.signal_used_queue()?; + self.signal_used_queue(INFLATE_INDEX)?; } Ok(()) @@ -361,7 +362,7 @@ impl Balloon { queue.advance_used_ring_idx(); if needs_interrupt { - self.signal_used_queue() + self.signal_used_queue(DEFLATE_INDEX) } else { Ok(()) } @@ -369,7 +370,7 @@ impl Balloon { pub(crate) fn process_stats_queue(&mut self) -> Result<(), BalloonError> { // This is safe since we checked in the event handler that the device is activated. - let mem = self.device_state.mem().unwrap(); + let mem = &self.device_state.active_state().unwrap().mem; METRICS.stats_updates_count.inc(); while let Some(head) = self.queues[STATS_INDEX].pop()? { @@ -403,11 +404,16 @@ impl Balloon { Ok(()) } - pub(crate) fn signal_used_queue(&self) -> Result<(), BalloonError> { - self.irq_trigger.trigger_irq(IrqType::Vring).map_err(|err| { - METRICS.event_fails.inc(); - BalloonError::InterruptError(err) - }) + pub(crate) fn signal_used_queue(&self, qidx: usize) -> Result<(), BalloonError> { + self.interrupt_trigger() + .trigger(VirtioInterruptType::Queue( + qidx.try_into() + .unwrap_or_else(|_| panic!("balloon: invalid queue id: {qidx}")), + )) + .map_err(|err| { + METRICS.event_fails.inc(); + BalloonError::InterruptError(err) + }) } /// Process device virtio queue(s). @@ -433,7 +439,7 @@ impl Balloon { if let Some(index) = self.stats_desc_index.take() { self.queues[STATS_INDEX].add_used(index, 0)?; self.queues[STATS_INDEX].advance_used_ring_idx(); - self.signal_used_queue() + self.signal_used_queue(STATS_INDEX) } else { error!("Failed to update balloon stats, missing descriptor."); Ok(()) @@ -444,8 +450,8 @@ impl Balloon { pub fn update_size(&mut self, amount_mib: u32) -> Result<(), BalloonError> { if self.is_activated() { self.config_space.num_pages = mib_to_pages(amount_mib)?; - self.irq_trigger - .trigger_irq(IrqType::Config) + self.interrupt_trigger() + .trigger(VirtioInterruptType::Config) .map_err(BalloonError::InterruptError) } else { Err(BalloonError::DeviceNotActive) @@ -556,8 +562,12 @@ impl VirtioDevice for Balloon { &self.queue_evts } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self.irq_trigger + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { + self.device_state + .active_state() + .expect("Device is not activated") + .interrupt + .deref() } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -584,13 +594,17 @@ impl VirtioDevice for Balloon { dst.copy_from_slice(data); } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc, + ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) .map_err(ActivateError::QueueMemoryError)?; } - self.device_state = DeviceState::Activated(mem); + self.device_state = DeviceState::Activated(ActiveState { mem, interrupt }); if self.activate_evt.write(1).is_err() { METRICS.activate_fails.inc(); self.device_state = DeviceState::Inactive; @@ -607,6 +621,16 @@ impl VirtioDevice for Balloon { fn is_activated(&self) -> bool { self.device_state.is_activated() } + + fn kick(&mut self) { + // If device is activated, kick the balloon queue(s) to make up for any + // pending or in-flight epoll events we may have not captured in snapshot. 
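The balloon hunks above replace the device-owned `IrqTrigger` with an interrupt object that only exists after activation: `activate(mem, interrupt)` stores both halves in an `ActiveState`, and `signal_used_queue()`/`interrupt_trigger()` reach back into it. A compressed sketch of that state model, with stand-in types in place of `GuestMemoryMmap` and the patch's `VirtioInterrupt` trait (the error type below is a guess):

```rust
use std::sync::Arc;

struct GuestMem; // stand-in for GuestMemoryMmap

enum VirtioInterruptType {
    Config,
    Queue(u16),
}

trait VirtioInterrupt {
    fn trigger(&self, kind: VirtioInterruptType) -> std::io::Result<()>;
}

struct ActiveState {
    mem: GuestMem,
    interrupt: Arc<dyn VirtioInterrupt>,
}

enum DeviceState {
    Inactive,
    Activated(ActiveState),
}

impl DeviceState {
    // Mirrors the active_state() accessor used above: guest memory and the
    // interrupt object only exist once the driver has activated the device.
    fn active_state(&self) -> Option<&ActiveState> {
        match self {
            DeviceState::Activated(state) => Some(state),
            DeviceState::Inactive => None,
        }
    }
}

fn main() {
    let state = DeviceState::Inactive;
    // Before activation there is nothing to signal with, which is why
    // config-change notifications above are gated on is_activated().
    assert!(state.active_state().is_none());
}
```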
+ // Stats queue doesn't need kicking as it is notified via a `timer_fd`. + if self.is_activated() { + info!("kick balloon {}.", self.id()); + self.process_virtio_queues(); + } + } } #[cfg(test)] @@ -619,7 +643,7 @@ pub(crate) mod tests { check_request_completion, invoke_handler_for_queue_event, set_request, }; use crate::devices::virtio::queue::{VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; - use crate::devices::virtio::test_utils::{VirtQueue, default_mem}; + use crate::devices::virtio::test_utils::{VirtQueue, default_interrupt, default_mem}; use crate::test_utils::single_region_mem; use crate::vstate::memory::GuestAddress; @@ -796,11 +820,12 @@ pub(crate) mod tests { fn test_invalid_request() { let mut balloon = Balloon::new(0, true, 0, false).unwrap(); let mem = default_mem(); + let interrupt = default_interrupt(); // Only initialize the inflate queue to demonstrate invalid request handling. let infq = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, infq.create_queue()); balloon.set_queue(DEFLATE_INDEX, infq.create_queue()); - balloon.activate(mem.clone()).unwrap(); + balloon.activate(mem.clone(), interrupt).unwrap(); // Fill the second page with non-zero bytes. for i in 0..0x1000 { @@ -856,10 +881,11 @@ pub(crate) mod tests { fn test_inflate() { let mut balloon = Balloon::new(0, true, 0, false).unwrap(); let mem = default_mem(); + let interrupt = default_interrupt(); let infq = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, infq.create_queue()); balloon.set_queue(DEFLATE_INDEX, infq.create_queue()); - balloon.activate(mem.clone()).unwrap(); + balloon.activate(mem.clone(), interrupt).unwrap(); // Fill the third page with non-zero bytes. for i in 0..0x1000 { @@ -927,10 +953,11 @@ pub(crate) mod tests { fn test_deflate() { let mut balloon = Balloon::new(0, true, 0, false).unwrap(); let mem = default_mem(); + let interrupt = default_interrupt(); let defq = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, defq.create_queue()); balloon.set_queue(DEFLATE_INDEX, defq.create_queue()); - balloon.activate(mem.clone()).unwrap(); + balloon.activate(mem.clone(), interrupt).unwrap(); let page_addr = 0x10; @@ -976,11 +1003,12 @@ pub(crate) mod tests { fn test_stats() { let mut balloon = Balloon::new(0, true, 1, false).unwrap(); let mem = default_mem(); + let interrupt = default_interrupt(); let statsq = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, statsq.create_queue()); balloon.set_queue(DEFLATE_INDEX, statsq.create_queue()); balloon.set_queue(STATS_INDEX, statsq.create_queue()); - balloon.activate(mem.clone()).unwrap(); + balloon.activate(mem.clone(), interrupt).unwrap(); let page_addr = 0x100; @@ -1056,7 +1084,9 @@ pub(crate) mod tests { assert!(balloon.stats_desc_index.is_some()); balloon.process_stats_timer_event().unwrap(); assert!(balloon.stats_desc_index.is_none()); - assert!(balloon.irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(balloon.interrupt_trigger().has_pending_interrupt( + VirtioInterruptType::Queue(STATS_INDEX.try_into().unwrap()) + )); }); } } @@ -1065,13 +1095,14 @@ pub(crate) mod tests { fn test_process_balloon_queues() { let mut balloon = Balloon::new(0x10, true, 0, false).unwrap(); let mem = default_mem(); + let interrupt = default_interrupt(); let infq = VirtQueue::new(GuestAddress(0), &mem, 16); let defq = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, infq.create_queue()); balloon.set_queue(DEFLATE_INDEX, defq.create_queue()); - 
balloon.activate(mem).unwrap(); + balloon.activate(mem, interrupt).unwrap(); balloon.process_virtio_queues().unwrap(); } @@ -1082,7 +1113,8 @@ pub(crate) mod tests { let q = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, q.create_queue()); balloon.set_queue(DEFLATE_INDEX, q.create_queue()); - balloon.activate(mem).unwrap(); + let interrupt = default_interrupt(); + balloon.activate(mem, interrupt).unwrap(); assert_eq!( format!("{:?}", balloon.update_stats_polling_interval(1)), "Err(StatisticsStateChange)" @@ -1095,7 +1127,8 @@ pub(crate) mod tests { balloon.set_queue(INFLATE_INDEX, q.create_queue()); balloon.set_queue(DEFLATE_INDEX, q.create_queue()); balloon.set_queue(STATS_INDEX, q.create_queue()); - balloon.activate(mem).unwrap(); + let interrupt = default_interrupt(); + balloon.activate(mem, interrupt).unwrap(); assert_eq!( format!("{:?}", balloon.update_stats_polling_interval(0)), "Err(StatisticsStateChange)" @@ -1115,7 +1148,10 @@ pub(crate) mod tests { fn test_num_pages() { let mut balloon = Balloon::new(0, true, 0, false).unwrap(); // Switch the state to active. - balloon.device_state = DeviceState::Activated(single_region_mem(0x1)); + balloon.device_state = DeviceState::Activated(ActiveState { + mem: single_region_mem(0x1), + interrupt: default_interrupt(), + }); assert_eq!(balloon.num_pages(), 0); assert_eq!(balloon.actual_pages(), 0); diff --git a/src/vmm/src/devices/virtio/balloon/event_handler.rs b/src/vmm/src/devices/virtio/balloon/event_handler.rs index 4e311edc045..3922b4b8385 100644 --- a/src/vmm/src/devices/virtio/balloon/event_handler.rs +++ b/src/vmm/src/devices/virtio/balloon/event_handler.rs @@ -136,7 +136,7 @@ pub mod tests { use super::*; use crate::devices::virtio::balloon::test_utils::set_request; - use crate::devices::virtio::test_utils::{VirtQueue, default_mem}; + use crate::devices::virtio::test_utils::{VirtQueue, default_interrupt, default_mem}; use crate::vstate::memory::GuestAddress; #[test] @@ -144,6 +144,7 @@ pub mod tests { let mut event_manager = EventManager::new().unwrap(); let mut balloon = Balloon::new(0, true, 10, false).unwrap(); let mem = default_mem(); + let interrupt = default_interrupt(); let infq = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, infq.create_queue()); balloon.set_queue(DEFLATE_INDEX, infq.create_queue()); @@ -179,7 +180,11 @@ pub mod tests { } // Now activate the device. - balloon.lock().unwrap().activate(mem.clone()).unwrap(); + balloon + .lock() + .unwrap() + .activate(mem.clone(), interrupt) + .unwrap(); // Process the activate event. let ev_count = event_manager.run_with_timeout(50).unwrap(); assert_eq!(ev_count, 1); diff --git a/src/vmm/src/devices/virtio/balloon/mod.rs b/src/vmm/src/devices/virtio/balloon/mod.rs index 5af1e17288a..3f3e9346545 100644 --- a/src/vmm/src/devices/virtio/balloon/mod.rs +++ b/src/vmm/src/devices/virtio/balloon/mod.rs @@ -81,7 +81,7 @@ pub enum BalloonError { MalformedPayload, /// Error restoring the balloon device queues. QueueRestoreError, - /// Received stats querry when stats are disabled. + /// Received stats query when stats are disabled. StatisticsDisabled, /// Statistics cannot be enabled/disabled after activation. StatisticsStateChange, diff --git a/src/vmm/src/devices/virtio/balloon/persist.rs b/src/vmm/src/devices/virtio/balloon/persist.rs index 004fa27f8ca..15ae1e26b9e 100644 --- a/src/vmm/src/devices/virtio/balloon/persist.rs +++ b/src/vmm/src/devices/virtio/balloon/persist.rs @@ -4,7 +4,6 @@ //! 
Defines the structures needed for saving/restoring balloon devices. use std::sync::Arc; -use std::sync::atomic::AtomicU32; use std::time::Duration; use serde::{Deserialize, Serialize}; @@ -13,9 +12,10 @@ use timerfd::{SetTimeFlags, TimerState}; use super::*; use crate::devices::virtio::TYPE_BALLOON; use crate::devices::virtio::balloon::device::{BalloonStats, ConfigSpace}; -use crate::devices::virtio::device::DeviceState; +use crate::devices::virtio::device::{ActiveState, DeviceState}; use crate::devices::virtio::persist::VirtioDeviceState; use crate::devices::virtio::queue::FIRECRACKER_MAX_QUEUE_SIZE; +use crate::devices::virtio::transport::VirtioInterrupt; use crate::snapshot::Persist; use crate::vstate::memory::GuestMemoryMmap; @@ -87,7 +87,7 @@ pub struct BalloonState { stats_desc_index: Option, latest_stats: BalloonStatsState, config_space: BalloonConfigSpaceState, - virtio_state: VirtioDeviceState, + pub virtio_state: VirtioDeviceState, } /// Auxiliary structure for creating a device when resuming from a snapshot. @@ -144,8 +144,6 @@ impl Persist<'_> for Balloon { FIRECRACKER_MAX_QUEUE_SIZE, ) .map_err(|_| Self::Error::QueueRestoreError)?; - balloon.irq_trigger.irq_status = - Arc::new(AtomicU32::new(state.virtio_state.interrupt_status)); balloon.avail_features = state.virtio_state.avail_features; balloon.acked_features = state.virtio_state.acked_features; balloon.latest_stats = state.latest_stats.create_stats(); @@ -154,22 +152,18 @@ impl Persist<'_> for Balloon { actual_pages: state.config_space.actual_pages, }; - if state.virtio_state.activated { - balloon.device_state = DeviceState::Activated(constructor_args.mem); - - if balloon.stats_enabled() { - // Restore the stats descriptor. - balloon.set_stats_desc_index(state.stats_desc_index); - - // Restart timer if needed. - let timer_state = TimerState::Periodic { - current: Duration::from_secs(u64::from(state.stats_polling_interval_s)), - interval: Duration::from_secs(u64::from(state.stats_polling_interval_s)), - }; - balloon - .stats_timer - .set_state(timer_state, SetTimeFlags::Default); - } + if state.virtio_state.activated && balloon.stats_enabled() { + // Restore the stats descriptor. + balloon.set_stats_desc_index(state.stats_desc_index); + + // Restart timer if needed. 
+ let timer_state = TimerState::Periodic { + current: Duration::from_secs(u64::from(state.stats_polling_interval_s)), + interval: Duration::from_secs(u64::from(state.stats_polling_interval_s)), + }; + balloon + .stats_timer + .set_state(timer_state, SetTimeFlags::Default); } Ok(balloon) @@ -178,12 +172,11 @@ impl Persist<'_> for Balloon { #[cfg(test)] mod tests { - use std::sync::atomic::Ordering; use super::*; use crate::devices::virtio::TYPE_BALLOON; use crate::devices::virtio::device::VirtioDevice; - use crate::devices::virtio::test_utils::default_mem; + use crate::devices::virtio::test_utils::{default_interrupt, default_mem}; use crate::snapshot::Snapshot; #[test] @@ -213,11 +206,8 @@ mod tests { assert_eq!(restored_balloon.avail_features, balloon.avail_features); assert_eq!(restored_balloon.config_space, balloon.config_space); assert_eq!(restored_balloon.queues(), balloon.queues()); - assert_eq!( - restored_balloon.interrupt_status().load(Ordering::Relaxed), - balloon.interrupt_status().load(Ordering::Relaxed) - ); - assert_eq!(restored_balloon.is_activated(), balloon.is_activated()); + assert!(!restored_balloon.is_activated()); + assert!(!balloon.is_activated()); assert_eq!( restored_balloon.stats_polling_interval_s, diff --git a/src/vmm/src/devices/virtio/balloon/test_utils.rs b/src/vmm/src/devices/virtio/balloon/test_utils.rs index af0d7f5845e..2665d5dbd87 100644 --- a/src/vmm/src/devices/virtio/balloon/test_utils.rs +++ b/src/vmm/src/devices/virtio/balloon/test_utils.rs @@ -3,6 +3,8 @@ #![doc(hidden)] +#[cfg(test)] +use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::test_utils::VirtQueue; #[cfg(test)] use crate::devices::virtio::{balloon::BALLOON_NUM_QUEUES, balloon::Balloon}; @@ -10,7 +12,7 @@ use crate::devices::virtio::{balloon::BALLOON_NUM_QUEUES, balloon::Balloon}; #[cfg(test)] pub fn invoke_handler_for_queue_event(b: &mut Balloon, queue_index: usize) { use crate::devices::virtio::balloon::{DEFLATE_INDEX, INFLATE_INDEX, STATS_INDEX}; - use crate::devices::virtio::device::IrqType; + use crate::devices::virtio::transport::VirtioInterruptType; assert!(queue_index < BALLOON_NUM_QUEUES); // Trigger the queue event. @@ -23,7 +25,11 @@ pub fn invoke_handler_for_queue_event(b: &mut Balloon, queue_index: usize) { _ => unreachable!(), }; // Validate the queue operation finished successfully. - assert!(b.irq_trigger.has_pending_irq(IrqType::Vring)); + let interrupt = b.interrupt_trigger(); + assert!( + interrupt + .has_pending_interrupt(VirtioInterruptType::Queue(queue_index.try_into().unwrap())) + ); } pub fn set_request(queue: &VirtQueue, idx: u16, addr: u64, len: u32, flags: u16) { diff --git a/src/vmm/src/devices/virtio/block/device.rs b/src/vmm/src/devices/virtio/block/device.rs index 5d41eb04078..c1fa95f7b1c 100644 --- a/src/vmm/src/devices/virtio/block/device.rs +++ b/src/vmm/src/devices/virtio/block/device.rs @@ -1,15 +1,19 @@ // Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 +use std::sync::Arc; + use event_manager::{EventOps, Events, MutEventSubscriber}; +use log::info; use vmm_sys_util::eventfd::EventFd; use super::BlockError; use super::persist::{BlockConstructorArgs, BlockState}; use super::vhost_user::device::{VhostUserBlock, VhostUserBlockConfig}; use super::virtio::device::{VirtioBlock, VirtioBlockConfig}; -use crate::devices::virtio::device::{IrqTrigger, VirtioDevice}; +use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::{InvalidAvailIdx, Queue}; +use crate::devices::virtio::transport::VirtioInterrupt; use crate::devices::virtio::{ActivateError, TYPE_BLOCK}; use crate::rate_limiter::BucketUpdate; use crate::snapshot::Persist; @@ -173,10 +177,10 @@ impl VirtioDevice for Block { } } - fn interrupt_trigger(&self) -> &IrqTrigger { + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { match self { - Self::Virtio(b) => &b.irq_trigger, - Self::VhostUser(b) => &b.irq_trigger, + Self::Virtio(b) => b.interrupt_trigger(), + Self::VhostUser(b) => b.interrupt_trigger(), } } @@ -194,10 +198,14 @@ impl VirtioDevice for Block { } } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc, + ) -> Result<(), ActivateError> { match self { - Self::Virtio(b) => b.activate(mem), - Self::VhostUser(b) => b.activate(mem), + Self::Virtio(b) => b.activate(mem, interrupt), + Self::VhostUser(b) => b.activate(mem, interrupt), } } @@ -207,6 +215,18 @@ impl VirtioDevice for Block { Self::VhostUser(b) => b.device_state.is_activated(), } } + + fn kick(&mut self) { + // If device is activated, kick the block queue(s) to make up for any + // pending or in-flight epoll events we may have not captured in + // snapshot. No need to kick Ratelimiters + // because they are restored 'unblocked' so + // any inflight `timer_fd` events can be safely discarded. + if self.is_activated() { + info!("kick block {}.", self.id()); + self.process_virtio_queues(); + } + } } impl MutEventSubscriber for Block { diff --git a/src/vmm/src/devices/virtio/block/persist.rs b/src/vmm/src/devices/virtio/block/persist.rs index 2d83c416d9f..cb9a6471137 100644 --- a/src/vmm/src/devices/virtio/block/persist.rs +++ b/src/vmm/src/devices/virtio/block/persist.rs @@ -1,10 +1,13 @@ // Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 +use std::sync::Arc; + use serde::{Deserialize, Serialize}; use super::vhost_user::persist::VhostUserBlockState; use super::virtio::persist::VirtioBlockState; +use crate::devices::virtio::transport::VirtioInterrupt; use crate::vstate::memory::GuestMemoryMmap; /// Block device state. @@ -14,6 +17,15 @@ pub enum BlockState { VhostUser(VhostUserBlockState), } +impl BlockState { + pub fn is_activated(&self) -> bool { + match self { + BlockState::Virtio(virtio_block_state) => virtio_block_state.virtio_state.activated, + BlockState::VhostUser(vhost_user_block_state) => false, + } + } +} + /// Auxiliary structure for creating a device when resuming from a snapshot. #[derive(Debug)] pub struct BlockConstructorArgs { diff --git a/src/vmm/src/devices/virtio/block/vhost_user/device.rs b/src/vmm/src/devices/virtio/block/vhost_user/device.rs index a42a2fe0c46..1d6c2aac080 100644 --- a/src/vmm/src/devices/virtio/block/vhost_user/device.rs +++ b/src/vmm/src/devices/virtio/block/vhost_user/device.rs @@ -4,6 +4,7 @@ // Portions Copyright 2019 Intel Corporation. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 +use std::ops::Deref; use std::sync::Arc; use log::error; @@ -14,11 +15,12 @@ use vmm_sys_util::eventfd::EventFd; use super::{NUM_QUEUES, QUEUE_SIZE, VhostUserBlockError}; use crate::devices::virtio::block::CacheType; -use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; +use crate::devices::virtio::device::{ActiveState, DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_blk::{VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO}; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::generated::virtio_ring::VIRTIO_RING_F_EVENT_IDX; use crate::devices::virtio::queue::Queue; +use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::devices::virtio::vhost_user::{VhostUserHandleBackend, VhostUserHandleImpl}; use crate::devices::virtio::vhost_user_metrics::{ VhostUserDeviceMetrics, VhostUserMetricsPerDevice, @@ -34,7 +36,7 @@ const BLOCK_CONFIG_SPACE_SIZE: u32 = 60; const AVAILABLE_FEATURES: u64 = (1 << VIRTIO_F_VERSION_1) | (1 << VIRTIO_RING_F_EVENT_IDX) - // vhost-user specific bit. Not defined in standart virtio spec. + // vhost-user specific bit. Not defined in standard virtio spec. // Specifies ability of frontend to negotiate protocol features. | VhostUserVirtioFeatures::PROTOCOL_FEATURES.bits() // We always try to negotiate readonly with the backend. @@ -117,7 +119,6 @@ pub struct VhostUserBlockImpl { pub queues: Vec, pub queue_evts: [EventFd; u64_to_usize(NUM_QUEUES)], pub device_state: DeviceState, - pub irq_trigger: IrqTrigger, // Implementation specific fields. pub id: String, @@ -143,7 +144,6 @@ impl std::fmt::Debug for VhostUserBlockImpl { .field("queues", &self.queues) .field("queue_evts", &self.queue_evts) .field("device_state", &self.device_state) - .field("irq_trigger", &self.irq_trigger) .field("id", &self.id) .field("partuuid", &self.partuuid) .field("cache_type", &self.cache_type) @@ -203,7 +203,6 @@ impl VhostUserBlockImpl { let queue_evts = [EventFd::new(libc::EFD_NONBLOCK).map_err(VhostUserBlockError::EventFd)?; u64_to_usize(NUM_QUEUES)]; let device_state = DeviceState::Inactive; - let irq_trigger = IrqTrigger::new().map_err(VhostUserBlockError::IrqTrigger)?; // We negotiated features with backend. Now these acked_features // are available for guest driver to choose from. @@ -225,7 +224,6 @@ impl VhostUserBlockImpl { queues, queue_evts, device_state, - irq_trigger, id: config.drive_id, partuuid: config.partuuid, @@ -256,6 +254,12 @@ impl VhostUserBlockImpl { pub fn config_update(&mut self) -> Result<(), VhostUserBlockError> { let start_time = get_time_us(ClockType::Monotonic); + let interrupt = self + .device_state + .active_state() + .expect("Device is not initialized") + .interrupt + .clone(); // This buffer is used for config size check in vhost crate. 
let buffer = [0u8; BLOCK_CONFIG_SPACE_SIZE as usize]; @@ -270,9 +274,9 @@ impl VhostUserBlockImpl { ) .map_err(VhostUserBlockError::Vhost)?; self.config_space = new_config_space; - self.irq_trigger - .trigger_irq(IrqType::Config) - .map_err(VhostUserBlockError::IrqTrigger)?; + interrupt + .trigger(VirtioInterruptType::Config) + .map_err(VhostUserBlockError::Interrupt)?; let delta_us = get_time_us(ClockType::Monotonic) - start_time; self.metrics.config_change_time_us.store(delta_us); @@ -310,8 +314,12 @@ impl VirtioDevice for VhostUserBlock &self.queue_evts } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self.irq_trigger + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { + self.device_state + .active_state() + .expect("Device is not initialized") + .interrupt + .deref() } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -330,7 +338,11 @@ impl VirtioDevice for VhostUserBlock // Other block config fields are immutable. } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc, + ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) .map_err(ActivateError::QueueMemoryError)?; @@ -345,14 +357,14 @@ impl VirtioDevice for VhostUserBlock self.vu_handle.setup_backend( &mem, &[(0, &self.queues[0], &self.queue_evts[0])], - &self.irq_trigger, + interrupt.clone(), ) }) .map_err(|err| { self.metrics.activate_fails.inc(); ActivateError::VhostUser(err) })?; - self.device_state = DeviceState::Activated(mem); + self.device_state = DeviceState::Activated(ActiveState { mem, interrupt }); let delta_us = get_time_us(ClockType::Monotonic) - start_time; self.metrics.activate_time_us.store(delta_us); Ok(()) @@ -375,8 +387,8 @@ mod tests { use super::*; use crate::devices::virtio::block::virtio::device::FileEngineType; - use crate::devices::virtio::mmio::VIRTIO_MMIO_INT_CONFIG; - use crate::devices::virtio::test_utils::VirtQueue; + use crate::devices::virtio::test_utils::{VirtQueue, default_interrupt, default_mem}; + use crate::devices::virtio::transport::mmio::VIRTIO_MMIO_INT_CONFIG; use crate::devices::virtio::vhost_user::tests::create_mem; use crate::test_utils::create_tmp_socket; use crate::vstate::memory::GuestAddress; @@ -652,6 +664,10 @@ mod tests { assert_eq!(vhost_block.config_space, vec![0x69, 0x69, 0x69]); // Testing [`config_update`] + vhost_block.device_state = DeviceState::Activated(ActiveState { + mem: default_mem(), + interrupt: default_interrupt(), + }); vhost_block.config_space = vec![]; vhost_block.config_update().unwrap(); assert_eq!(vhost_block.config_space, vec![0x69, 0x69, 0x69]); @@ -783,9 +799,10 @@ mod tests { let guest_memory = create_mem(file, ®ions); let q = VirtQueue::new(GuestAddress(0), &guest_memory, 16); vhost_block.queues[0] = q.create_queue(); + let interrupt = default_interrupt(); // During actiavion of the device features, memory and queues should be set and activated. 
- vhost_block.activate(guest_memory).unwrap(); + vhost_block.activate(guest_memory, interrupt).unwrap(); assert!(unsafe { *vhost_block.vu_handle.vu.features_are_set.get() }); assert!(unsafe { *vhost_block.vu_handle.vu.memory_is_set.get() }); assert!(unsafe { *vhost_block.vu_handle.vu.vring_enabled.get() }); diff --git a/src/vmm/src/devices/virtio/block/vhost_user/mod.rs b/src/vmm/src/devices/virtio/block/vhost_user/mod.rs index 8d4d9f44261..0afaaed3400 100644 --- a/src/vmm/src/devices/virtio/block/vhost_user/mod.rs +++ b/src/vmm/src/devices/virtio/block/vhost_user/mod.rs @@ -28,5 +28,5 @@ pub enum VhostUserBlockError { /// Error opening eventfd: {0} EventFd(std::io::Error), /// Error creating irqfd: {0} - IrqTrigger(std::io::Error), + Interrupt(std::io::Error), } diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index 2f5d88114b6..d04fd5674ea 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -9,6 +9,7 @@ use std::cmp; use std::convert::From; use std::fs::{File, OpenOptions}; use std::io::{Seek, SeekFrom}; +use std::ops::Deref; use std::os::linux::fs::MetadataExt; use std::path::PathBuf; use std::sync::Arc; @@ -23,13 +24,14 @@ use super::request::*; use super::{BLOCK_QUEUE_SIZES, SECTOR_SHIFT, SECTOR_SIZE, VirtioBlockError, io as block_io}; use crate::devices::virtio::block::CacheType; use crate::devices::virtio::block::virtio::metrics::{BlockDeviceMetrics, BlockMetricsPerDevice}; -use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; +use crate::devices::virtio::device::{ActiveState, DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_blk::{ VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO, VIRTIO_BLK_ID_BYTES, }; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::generated::virtio_ring::VIRTIO_RING_F_EVENT_IDX; use crate::devices::virtio::queue::{InvalidAvailIdx, Queue}; +use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::devices::virtio::{ActivateError, TYPE_BLOCK}; use crate::logger::{IncMetric, error, warn}; use crate::rate_limiter::{BucketUpdate, RateLimiter}; @@ -249,7 +251,6 @@ pub struct VirtioBlock { pub queues: Vec, pub queue_evts: [EventFd; 1], pub device_state: DeviceState, - pub irq_trigger: IrqTrigger, // Implementation specific fields. pub id: String, @@ -322,7 +323,6 @@ impl VirtioBlock { queues, queue_evts, device_state: DeviceState::Inactive, - irq_trigger: IrqTrigger::new().map_err(VirtioBlockError::IrqTrigger)?, id: config.drive_id.clone(), partuuid: config.partuuid, @@ -387,34 +387,40 @@ impl VirtioBlock { /// Device specific function for peaking inside a queue and processing descriptors. pub fn process_queue(&mut self, queue_index: usize) -> Result<(), InvalidAvailIdx> { // This is safe since we checked in the event handler that the device is activated. - let mem = self.device_state.mem().unwrap(); + let active_state = self.device_state.active_state().unwrap(); let queue = &mut self.queues[queue_index]; let mut used_any = false; while let Some(head) = queue.pop_or_enable_notification()? { self.metrics.remaining_reqs_count.add(queue.len().into()); - let processing_result = match Request::parse(&head, mem, self.disk.nsectors) { - Ok(request) => { - if request.rate_limit(&mut self.rate_limiter) { - // Stop processing the queue and return this descriptor chain to the - // avail ring, for later processing. 
- queue.undo_pop(); - self.metrics.rate_limiter_throttled_events.inc(); - break; + let processing_result = + match Request::parse(&head, &active_state.mem, self.disk.nsectors) { + Ok(request) => { + if request.rate_limit(&mut self.rate_limiter) { + // Stop processing the queue and return this descriptor chain to the + // avail ring, for later processing. + queue.undo_pop(); + self.metrics.rate_limiter_throttled_events.inc(); + break; + } + + request.process( + &mut self.disk, + head.index, + &active_state.mem, + &self.metrics, + ) } - - request.process(&mut self.disk, head.index, mem, &self.metrics) - } - Err(err) => { - error!("Failed to parse available descriptor chain: {:?}", err); - self.metrics.execute_fails.inc(); - ProcessingResult::Executed(FinishedRequest { - num_bytes_to_mem: 0, - desc_idx: head.index, - }) - } - }; + Err(err) => { + error!("Failed to parse available descriptor chain: {:?}", err); + self.metrics.execute_fails.inc(); + ProcessingResult::Executed(FinishedRequest { + num_bytes_to_mem: 0, + desc_idx: head.index, + }) + } + }; match processing_result { ProcessingResult::Submitted => {} @@ -439,8 +445,9 @@ impl VirtioBlock { queue.advance_used_ring_idx(); if used_any && queue.prepare_kick() { - self.irq_trigger - .trigger_irq(IrqType::Vring) + active_state + .interrupt + .trigger(VirtioInterruptType::Queue(0)) .unwrap_or_else(|_| { self.metrics.event_fails.inc(); }); @@ -463,11 +470,11 @@ impl VirtioBlock { let engine = unwrap_async_file_engine_or_return!(&mut self.disk.file_engine); // This is safe since we checked in the event handler that the device is activated. - let mem = self.device_state.mem().unwrap(); + let active_state = self.device_state.active_state().unwrap(); let queue = &mut self.queues[0]; loop { - match engine.pop(mem) { + match engine.pop(&active_state.mem) { Err(error) => { error!("Failed to read completed io_uring entry: {:?}", error); break; @@ -486,7 +493,7 @@ impl VirtioBlock { ))), ), }; - let finished = pending.finish(mem, res, &self.metrics); + let finished = pending.finish(&active_state.mem, res, &self.metrics); queue .add_used(finished.desc_idx, finished.num_bytes_to_mem) .unwrap_or_else(|err| { @@ -501,8 +508,9 @@ impl VirtioBlock { queue.advance_used_ring_idx(); if queue.prepare_kick() { - self.irq_trigger - .trigger_irq(IrqType::Vring) + active_state + .interrupt + .trigger(VirtioInterruptType::Queue(0)) .unwrap_or_else(|_| { self.metrics.event_fails.inc(); }); @@ -529,8 +537,12 @@ impl VirtioBlock { self.disk.update(disk_image_path, self.read_only)?; self.config_space.capacity = self.disk.nsectors.to_le(); // virtio_block_config_space(); - // Kick the driver to pick up the changes. - self.irq_trigger.trigger_irq(IrqType::Config).unwrap(); + // Kick the driver to pick up the changes. (But only if the device is already activated). 
+ if self.is_activated() { + self.interrupt_trigger() + .trigger(VirtioInterruptType::Config) + .unwrap(); + } self.metrics.update_count.inc(); Ok(()) @@ -597,8 +609,12 @@ impl VirtioDevice for VirtioBlock { &self.queue_evts } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self.irq_trigger + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { + self.device_state + .active_state() + .expect("Device is not initialized") + .interrupt + .deref() } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -627,7 +643,11 @@ impl VirtioDevice for VirtioBlock { dst.copy_from_slice(data); } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc, + ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) .map_err(ActivateError::QueueMemoryError)?; @@ -644,7 +664,7 @@ impl VirtioDevice for VirtioBlock { self.metrics.activate_fails.inc(); return Err(ActivateError::EventFd); } - self.device_state = DeviceState::Activated(mem); + self.device_state = DeviceState::Activated(ActiveState { mem, interrupt }); Ok(()) } @@ -687,7 +707,7 @@ mod tests { simulate_queue_event, }; use crate::devices::virtio::queue::{VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; - use crate::devices::virtio::test_utils::{VirtQueue, default_mem}; + use crate::devices::virtio::test_utils::{VirtQueue, default_interrupt, default_mem}; use crate::rate_limiter::TokenType; use crate::vstate::memory::{Address, Bytes, GuestAddress}; @@ -829,7 +849,7 @@ mod tests { block.read_config(0, actual_config_space.as_mut_slice()); assert_eq!(actual_config_space, expected_config_space); - // If priviledged user writes to `/dev/mem`, in block config space - byte by byte. + // If privileged user writes to `/dev/mem`, in block config space - byte by byte. 
let expected_config_space = ConfigSpace { capacity: 0x1122334455667788, }; @@ -862,9 +882,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -890,9 +911,10 @@ mod tests { let mut block = default_block(engine); // Default mem size is 0x10000 let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -953,9 +975,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1004,9 +1027,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1036,9 +1060,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); vq.dtable[1].set(0xf000, 0x1000, VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, 2); @@ -1072,9 +1097,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1119,9 +1145,10 @@ mod tests { // Default mem size is 0x10000 let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1358,9 +1385,10 @@ mod tests { { // Default mem size is 0x10000 let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut 
block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); vq.dtable[1].set(0xff00, 0x1000, VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, 2); @@ -1399,9 +1427,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1445,9 +1474,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1569,9 +1599,10 @@ mod tests { let mut block = default_block(FileEngineType::Async); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, IO_URING_NUM_ENTRIES * 4); block.queues[0] = vq.create_queue(); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); // Run scenario that doesn't trigger FullSq BlockError: Add sq_size flush requests. add_flush_requests_batch(&mut block, &vq, IO_URING_NUM_ENTRIES); @@ -1603,9 +1634,10 @@ mod tests { let mut block = default_block(FileEngineType::Async); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, IO_URING_NUM_ENTRIES * 4); block.queues[0] = vq.create_queue(); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); // Run scenario that triggers FullCqError. Push 2 * IO_URING_NUM_ENTRIES and wait for // completion. Then try to push another entry. @@ -1633,9 +1665,10 @@ mod tests { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); block.queues[0] = vq.create_queue(); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); // Add a batch of flush requests. 
add_flush_requests_batch(&mut block, &vq, 5); @@ -1652,9 +1685,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1721,9 +1755,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1803,6 +1838,11 @@ mod tests { fn test_update_disk_image() { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); + let mem = default_mem(); + let interrupt = default_interrupt(); + let vq = VirtQueue::new(GuestAddress(0), &mem, 16); + set_queue(&mut block, 0, vq.create_queue()); + block.activate(mem, interrupt).unwrap(); let f = TempFile::new().unwrap(); let path = f.as_path(); let mdata = metadata(path).unwrap(); diff --git a/src/vmm/src/devices/virtio/block/virtio/event_handler.rs b/src/vmm/src/devices/virtio/block/virtio/event_handler.rs index db69e23d7f0..03c09a01972 100644 --- a/src/vmm/src/devices/virtio/block/virtio/event_handler.rs +++ b/src/vmm/src/devices/virtio/block/virtio/event_handler.rs @@ -124,7 +124,7 @@ mod tests { }; use crate::devices::virtio::block::virtio::{VIRTIO_BLK_S_OK, VIRTIO_BLK_T_OUT}; use crate::devices::virtio::queue::VIRTQ_DESC_F_NEXT; - use crate::devices::virtio::test_utils::{VirtQueue, default_mem}; + use crate::devices::virtio::test_utils::{VirtQueue, default_interrupt, default_mem}; use crate::vstate::memory::{Bytes, GuestAddress}; #[test] @@ -132,6 +132,7 @@ mod tests { let mut event_manager = EventManager::new().unwrap(); let mut block = default_block(FileEngineType::default()); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); read_blk_req_descriptors(&vq); @@ -162,7 +163,11 @@ mod tests { assert_eq!(ev_count, 0); // Now activate the device. - block.lock().unwrap().activate(mem.clone()).unwrap(); + block + .lock() + .unwrap() + .activate(mem.clone(), interrupt) + .unwrap(); // Process the activate event. 
let ev_count = event_manager.run_with_timeout(50).unwrap(); assert_eq!(ev_count, 1); diff --git a/src/vmm/src/devices/virtio/block/virtio/mod.rs b/src/vmm/src/devices/virtio/block/virtio/mod.rs index 8ea59a5aba4..9e97d6d3897 100644 --- a/src/vmm/src/devices/virtio/block/virtio/mod.rs +++ b/src/vmm/src/devices/virtio/block/virtio/mod.rs @@ -57,8 +57,8 @@ pub enum VirtioBlockError { BackingFile(std::io::Error, String), /// Error opening eventfd: {0} EventFd(std::io::Error), - /// Error creating an irqfd: {0} - IrqTrigger(std::io::Error), + /// Error creating an interrupt: {0} + Interrupt(std::io::Error), /// Error coming from the rate limiter: {0} RateLimiter(std::io::Error), /// Persistence error: {0} diff --git a/src/vmm/src/devices/virtio/block/virtio/persist.rs b/src/vmm/src/devices/virtio/block/virtio/persist.rs index 8c6f2c2453d..1c7a1bce106 100644 --- a/src/vmm/src/devices/virtio/block/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/block/virtio/persist.rs @@ -3,9 +3,6 @@ //! Defines the structures needed for saving/restoring block devices. -use std::sync::Arc; -use std::sync::atomic::AtomicU32; - use device::ConfigSpace; use serde::{Deserialize, Serialize}; use vmm_sys_util::eventfd::EventFd; @@ -16,7 +13,7 @@ use crate::devices::virtio::TYPE_BLOCK; use crate::devices::virtio::block::persist::BlockConstructorArgs; use crate::devices::virtio::block::virtio::device::FileEngineType; use crate::devices::virtio::block::virtio::metrics::BlockMetricsPerDevice; -use crate::devices::virtio::device::{DeviceState, IrqTrigger}; +use crate::devices::virtio::device::{ActiveState, DeviceState}; use crate::devices::virtio::generated::virtio_blk::VIRTIO_BLK_F_RO; use crate::devices::virtio::persist::VirtioDeviceState; use crate::rate_limiter::RateLimiter; @@ -61,7 +58,7 @@ pub struct VirtioBlockState { cache_type: CacheType, root_device: bool, disk_path: String, - virtio_state: VirtioDeviceState, + pub virtio_state: VirtioDeviceState, rate_limiter_state: RateLimiterState, file_engine_type: FileEngineTypeState, } @@ -111,18 +108,9 @@ impl Persist<'_> for VirtioBlock { ) .map_err(VirtioBlockError::Persist)?; - let mut irq_trigger = IrqTrigger::new().map_err(VirtioBlockError::IrqTrigger)?; - irq_trigger.irq_status = Arc::new(AtomicU32::new(state.virtio_state.interrupt_status)); - let avail_features = state.virtio_state.avail_features; let acked_features = state.virtio_state.acked_features; - let device_state = if state.virtio_state.activated { - DeviceState::Activated(constructor_args.mem) - } else { - DeviceState::Inactive - }; - let config_space = ConfigSpace { capacity: disk_properties.nsectors.to_le(), }; @@ -135,8 +123,7 @@ impl Persist<'_> for VirtioBlock { queues, queue_evts, - device_state, - irq_trigger, + device_state: DeviceState::Inactive, id: state.id.clone(), partuuid: state.partuuid.clone(), @@ -154,14 +141,12 @@ impl Persist<'_> for VirtioBlock { #[cfg(test)] mod tests { - use std::sync::atomic::Ordering; - use vmm_sys_util::tempfile::TempFile; use super::*; use crate::devices::virtio::block::virtio::device::VirtioBlockConfig; use crate::devices::virtio::device::VirtioDevice; - use crate::devices::virtio::test_utils::default_mem; + use crate::devices::virtio::test_utils::{default_interrupt, default_mem}; use crate::snapshot::Snapshot; #[test] @@ -243,11 +228,8 @@ mod tests { assert_eq!(restored_block.avail_features(), block.avail_features()); assert_eq!(restored_block.acked_features(), block.acked_features()); assert_eq!(restored_block.queues(), block.queues()); - assert_eq!( - 
restored_block.interrupt_status().load(Ordering::Relaxed), - block.interrupt_status().load(Ordering::Relaxed) - ); - assert_eq!(restored_block.is_activated(), block.is_activated()); + assert!(!block.is_activated()); + assert!(!restored_block.is_activated()); // Test that block specific fields are the same. assert_eq!(restored_block.disk.file_path, block.disk.file_path); diff --git a/src/vmm/src/devices/virtio/block/virtio/test_utils.rs b/src/vmm/src/devices/virtio/block/virtio/test_utils.rs index 02dd34fbce9..e4f23c6a038 100644 --- a/src/vmm/src/devices/virtio/block/virtio/test_utils.rs +++ b/src/vmm/src/devices/virtio/block/virtio/test_utils.rs @@ -17,9 +17,11 @@ use crate::devices::virtio::block::virtio::device::FileEngineType; use crate::devices::virtio::block::virtio::io::FileEngine; use crate::devices::virtio::block::virtio::{CacheType, VirtioBlock}; #[cfg(test)] -use crate::devices::virtio::device::IrqType; +use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::{Queue, VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; use crate::devices::virtio::test_utils::{VirtQueue, VirtqDesc}; +#[cfg(test)] +use crate::devices::virtio::transport::VirtioInterruptType; use crate::rate_limiter::RateLimiter; use crate::vmm_config::{RateLimiterConfig, TokenBucketConfig}; use crate::vstate::memory::{Bytes, GuestAddress}; @@ -77,12 +79,17 @@ pub fn rate_limiter(blk: &mut VirtioBlock) -> &RateLimiter { #[cfg(test)] pub fn simulate_queue_event(b: &mut VirtioBlock, maybe_expected_irq: Option) { // Trigger the queue event. + b.queue_evts[0].write(1).unwrap(); // Handle event. b.process_queue_event(); // Validate the queue operation finished successfully. if let Some(expected_irq) = maybe_expected_irq { - assert_eq!(b.irq_trigger.has_pending_irq(IrqType::Vring), expected_irq); + assert_eq!( + b.interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(0)), + expected_irq + ); } } @@ -98,7 +105,11 @@ pub fn simulate_async_completion_event(b: &mut VirtioBlock, expected_irq: bool) } // Validate if there are pending IRQs. - assert_eq!(b.irq_trigger.has_pending_irq(IrqType::Vring), expected_irq); + assert_eq!( + b.interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(0)), + expected_irq + ); } #[cfg(test)] diff --git a/src/vmm/src/devices/virtio/device.rs b/src/vmm/src/devices/virtio/device.rs index ba1ca6b279e..ca3efc8cf2f 100644 --- a/src/vmm/src/devices/virtio/device.rs +++ b/src/vmm/src/devices/virtio/device.rs @@ -7,23 +7,30 @@ use std::fmt; use std::sync::Arc; -use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::atomic::AtomicU32; use vmm_sys_util::eventfd::EventFd; use super::ActivateError; -use super::mmio::{VIRTIO_MMIO_INT_CONFIG, VIRTIO_MMIO_INT_VRING}; use super::queue::{Queue, QueueError}; +use super::transport::VirtioInterrupt; use crate::devices::virtio::AsAny; -use crate::logger::{error, warn}; +use crate::logger::warn; use crate::vstate::memory::GuestMemoryMmap; +/// State of an active VirtIO device +#[derive(Debug, Clone)] +pub struct ActiveState { + pub mem: GuestMemoryMmap, + pub interrupt: Arc, +} + /// Enum that indicates if a VirtioDevice is inactive or has been activated /// and memory attached to it. #[derive(Debug)] pub enum DeviceState { Inactive, - Activated(GuestMemoryMmap), + Activated(ActiveState), } impl DeviceState { @@ -35,55 +42,15 @@ impl DeviceState { } } - /// Gets the memory attached to the device if it is activated. 
- pub fn mem(&self) -> Option<&GuestMemoryMmap> { + /// Gets the memory and interrupt attached to the device if it is activated. + pub fn active_state(&self) -> Option<&ActiveState> { match self { - DeviceState::Activated(mem) => Some(mem), + DeviceState::Activated(state) => Some(state), DeviceState::Inactive => None, } } } -/// The 2 types of interrupt sources in MMIO transport. -#[derive(Debug)] -pub enum IrqType { - /// Interrupt triggered by change in config. - Config, - /// Interrupt triggered by used vring buffers. - Vring, -} - -/// Helper struct that is responsible for triggering guest IRQs -#[derive(Debug)] -pub struct IrqTrigger { - pub(crate) irq_status: Arc, - pub(crate) irq_evt: EventFd, -} - -impl IrqTrigger { - pub fn new() -> std::io::Result { - Ok(Self { - irq_status: Arc::new(AtomicU32::new(0)), - irq_evt: EventFd::new(libc::EFD_NONBLOCK)?, - }) - } - - pub fn trigger_irq(&self, irq_type: IrqType) -> Result<(), std::io::Error> { - let irq = match irq_type { - IrqType::Config => VIRTIO_MMIO_INT_CONFIG, - IrqType::Vring => VIRTIO_MMIO_INT_VRING, - }; - self.irq_status.fetch_or(irq, Ordering::SeqCst); - - self.irq_evt.write(1).map_err(|err| { - error!("Failed to send irq to the guest: {:?}", err); - err - })?; - - Ok(()) - } -} - /// Trait for virtio devices to be driven by a virtio transport. /// /// The lifecycle of a virtio device is to be moved to a virtio transport, which will then query the @@ -121,10 +88,10 @@ pub trait VirtioDevice: AsAny + Send { /// Returns the current device interrupt status. fn interrupt_status(&self) -> Arc { - Arc::clone(&self.interrupt_trigger().irq_status) + self.interrupt_trigger().status() } - fn interrupt_trigger(&self) -> &IrqTrigger; + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt; /// The set of feature bits shifted by `page * 32`. fn avail_features_by_page(&self, page: u32) -> u32 { @@ -170,14 +137,18 @@ pub trait VirtioDevice: AsAny + Send { fn write_config(&mut self, offset: u64, data: &[u8]); /// Performs the formal activation for a device, which can be verified also with `is_activated`. - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError>; + fn activate( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc, + ) -> Result<(), ActivateError>; /// Checks if the resources of this device are activated. fn is_activated(&self) -> bool; /// Optionally deactivates this device and returns ownership of the guest memory map, interrupt /// event, and queue events. - fn reset(&mut self) -> Option<(EventFd, Vec)> { + fn reset(&mut self) -> Option<(Arc, Vec)> { None } @@ -188,6 +159,9 @@ pub trait VirtioDevice: AsAny + Send { } Ok(()) } + + /// Kick the device, as if it had received external events. + fn kick(&mut self) {} } impl fmt::Debug for dyn VirtioDevice { @@ -200,47 +174,6 @@ impl fmt::Debug for dyn VirtioDevice { pub(crate) mod tests { use super::*; - impl IrqTrigger { - pub fn has_pending_irq(&self, irq_type: IrqType) -> bool { - if let Ok(num_irqs) = self.irq_evt.read() { - if num_irqs == 0 { - return false; - } - - let irq_status = self.irq_status.load(Ordering::SeqCst); - return matches!( - (irq_status, irq_type), - (VIRTIO_MMIO_INT_CONFIG, IrqType::Config) - | (VIRTIO_MMIO_INT_VRING, IrqType::Vring) - ); - } - - false - } - } - - #[test] - fn irq_trigger() { - let irq_trigger = IrqTrigger::new().unwrap(); - assert_eq!(irq_trigger.irq_status.load(Ordering::SeqCst), 0); - - // Check that there are no pending irqs. 
- assert!(!irq_trigger.has_pending_irq(IrqType::Config)); - assert!(!irq_trigger.has_pending_irq(IrqType::Vring)); - - // Check that trigger_irq() correctly generates irqs. - irq_trigger.trigger_irq(IrqType::Config).unwrap(); - assert!(irq_trigger.has_pending_irq(IrqType::Config)); - irq_trigger.irq_status.store(0, Ordering::SeqCst); - irq_trigger.trigger_irq(IrqType::Vring).unwrap(); - assert!(irq_trigger.has_pending_irq(IrqType::Vring)); - - // Check trigger_irq() failure case (irq_evt is full). - irq_trigger.irq_evt.write(u64::MAX - 1).unwrap(); - irq_trigger.trigger_irq(IrqType::Config).unwrap_err(); - irq_trigger.trigger_irq(IrqType::Vring).unwrap_err(); - } - #[derive(Debug)] struct MockVirtioDevice { acked_features: u64, @@ -275,7 +208,7 @@ pub(crate) mod tests { todo!() } - fn interrupt_trigger(&self) -> &IrqTrigger { + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { todo!() } @@ -287,7 +220,11 @@ pub(crate) mod tests { todo!() } - fn activate(&mut self, _mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + _mem: GuestMemoryMmap, + _interrupt: Arc, + ) -> Result<(), ActivateError> { todo!() } diff --git a/src/vmm/src/devices/virtio/mod.rs b/src/vmm/src/devices/virtio/mod.rs index f298d28e9bd..0ac3b660397 100644 --- a/src/vmm/src/devices/virtio/mod.rs +++ b/src/vmm/src/devices/virtio/mod.rs @@ -18,12 +18,12 @@ pub mod device; pub mod generated; mod iov_deque; pub mod iovec; -pub mod mmio; pub mod net; pub mod persist; pub mod queue; pub mod rng; pub mod test_utils; +pub mod transport; pub mod vhost_user; pub mod vhost_user_metrics; pub mod vsock; diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index 2ce60707271..0b2f3150c09 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -8,14 +8,16 @@ use std::collections::VecDeque; use std::mem::{self}; use std::net::Ipv4Addr; +use std::num::Wrapping; +use std::ops::Deref; use std::sync::{Arc, Mutex}; use libc::{EAGAIN, iovec}; -use log::error; +use log::{error, info}; use vmm_sys_util::eventfd::EventFd; use super::NET_QUEUE_MAX_SIZE; -use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; +use crate::devices::virtio::device::{ActiveState, DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::generated::virtio_net::{ VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, @@ -32,6 +34,7 @@ use crate::devices::virtio::net::{ MAX_BUFFER_SIZE, NET_QUEUE_SIZES, NetError, NetQueue, RX_INDEX, TX_INDEX, generated, }; use crate::devices::virtio::queue::{DescriptorChain, InvalidAvailIdx, Queue}; +use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::devices::virtio::{ActivateError, TYPE_NET}; use crate::devices::{DeviceError, report_net_event_fail}; use crate::dumbo::pdu::arp::ETH_IPV4_FRAME_LEN; @@ -249,8 +252,6 @@ pub struct Net { tx_frame_headers: [u8; frame_hdr_len()], - pub(crate) irq_trigger: IrqTrigger, - pub(crate) config_space: ConfigSpace, pub(crate) guest_mac: Option, @@ -313,7 +314,6 @@ impl Net { tx_rate_limiter, rx_frame_buf: [0u8; MAX_BUFFER_SIZE], tx_frame_headers: [0u8; frame_hdr_len()], - irq_trigger: IrqTrigger::new().map_err(NetError::EventFd)?, config_space, guest_mac, device_state: DeviceState::Inactive, @@ -392,15 +392,15 @@ impl Net { /// 
https://docs.oasis-open.org/virtio/virtio/v1.1/csprd01/virtio-v1.1-csprd01.html#x1-320005 /// 2.6.7.1 Driver Requirements: Used Buffer Notification Suppression fn try_signal_queue(&mut self, queue_type: NetQueue) -> Result<(), DeviceError> { - let queue = match queue_type { - NetQueue::Rx => &mut self.queues[RX_INDEX], - NetQueue::Tx => &mut self.queues[TX_INDEX], + let qidx = match queue_type { + NetQueue::Rx => RX_INDEX, + NetQueue::Tx => TX_INDEX, }; - queue.advance_used_ring_idx(); + self.queues[qidx].advance_used_ring_idx(); - if queue.prepare_kick() { - self.irq_trigger - .trigger_irq(IrqType::Vring) + if self.queues[qidx].prepare_kick() { + self.interrupt_trigger() + .trigger(VirtioInterruptType::Queue(qidx.try_into().unwrap())) .map_err(|err| { self.metrics.event_fails.inc(); DeviceError::FailedSignalingIrq(err) @@ -464,7 +464,7 @@ impl Net { /// Parse available RX `DescriptorChains` from the queue pub fn parse_rx_descriptors(&mut self) -> Result<(), InvalidAvailIdx> { // This is safe since we checked in the event handler that the device is activated. - let mem = self.device_state.mem().unwrap(); + let mem = &self.device_state.active_state().unwrap().mem; let queue = &mut self.queues[RX_INDEX]; while let Some(head) = queue.pop_or_enable_notification()? { let index = head.index; @@ -686,7 +686,7 @@ impl Net { fn process_tx(&mut self) -> Result<(), DeviceError> { // This is safe since we checked in the event handler that the device is activated. - let mem = self.device_state.mem().unwrap(); + let mem = &self.device_state.active_state().unwrap().mem; // The MMDS network stack works like a state machine, based on synchronous calls, and // without being added to any event loop. If any frame is accepted by the MMDS, we also @@ -937,6 +937,26 @@ impl Net { Ok(()) } + + /// Prepare saving state + pub fn prepare_save(&mut self) { + // We shouldn't be messing with the queue if the device is not activated. + // Anyways, if it isn't there's nothing to prepare; we haven't parsed any + // descriptors yet from it and we can't have a deferred frame. + if !self.is_activated() { + return; + } + + // Give potential deferred RX frame to guest + self.rx_buffer.finish_frame(&mut self.queues[RX_INDEX]); + // Reset the parsed available descriptors, so we will re-parse them + self.queues[RX_INDEX].next_avail -= + Wrapping(u16::try_from(self.rx_buffer.parsed_descriptors.len()).unwrap()); + self.rx_buffer.parsed_descriptors.clear(); + self.rx_buffer.iovec.clear(); + self.rx_buffer.used_bytes = 0; + self.rx_buffer.used_descriptors = 0; + } } impl VirtioDevice for Net { @@ -968,9 +988,14 @@ impl VirtioDevice for Net { &self.queue_evts } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self.irq_trigger + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { + self.device_state + .active_state() + .expect("Device is not implemented") + .interrupt + .deref() } + fn read_config(&self, offset: u64, data: &mut [u8]) { if let Some(config_space_bytes) = self.config_space.as_slice().get(u64_to_usize(offset)..) 
{ let len = config_space_bytes.len().min(data.len()); @@ -999,7 +1024,11 @@ impl VirtioDevice for Net { self.metrics.mac_address_updates.inc(); } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc, + ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) .map_err(ActivateError::QueueMemoryError)?; @@ -1023,13 +1052,24 @@ impl VirtioDevice for Net { self.metrics.activate_fails.inc(); return Err(ActivateError::EventFd); } - self.device_state = DeviceState::Activated(mem); + self.device_state = DeviceState::Activated(ActiveState { mem, interrupt }); Ok(()) } fn is_activated(&self) -> bool { self.device_state.is_activated() } + + fn kick(&mut self) { + // If device is activated, kick the net queue(s) to make up for any + // pending or in-flight epoll events we may have not captured in snapshot. + // No need to kick Ratelimiters because they are restored 'unblocked' so + // any inflight `timer_fd` events can be safely discarded. + if self.is_activated() { + info!("kick net {}.", self.id()); + self.process_virtio_queues(); + } + } } #[cfg(test)] @@ -1401,7 +1441,12 @@ pub mod tests { // Check that the used queue has advanced. assert_eq!(th.rxq.used.idx.get(), 4); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); + // Check that the invalid descriptor chains have been discarded th.rxq.check_used_elem(0, 0, 0); th.rxq.check_used_elem(1, 3, 0); @@ -1458,7 +1503,11 @@ pub mod tests { assert!(th.net().rx_buffer.used_descriptors == 0); // Check that the used queue has advanced. assert_eq!(th.rxq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); // Check that the frame has been written successfully to the Rx descriptor chain. header_set_num_buffers(frame.as_mut_slice(), 1); th.rxq @@ -1521,7 +1570,11 @@ pub mod tests { assert!(th.net().rx_buffer.used_bytes == 0); // Check that the used queue has advanced. assert_eq!(th.rxq.used.idx.get(), 2); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); // Check that the 1st frame was written successfully to the 1st Rx descriptor chain. header_set_num_buffers(frame_1.as_mut_slice(), 1); th.rxq @@ -1579,7 +1632,11 @@ pub mod tests { assert!(th.net().rx_buffer.used_bytes == 0); // Check that the used queue has advanced. assert_eq!(th.rxq.used.idx.get(), 2); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); // 2 chains should be used for the packet. header_set_num_buffers(frame.as_mut_slice(), 2); @@ -1644,7 +1701,11 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(TX_INDEX as u16)) + ); th.txq.check_used_elem(0, 0, 0); // Check that the frame was skipped. 
assert!(!tap_traffic_simulator.pop_rx_packet(&mut [])); @@ -1667,7 +1728,11 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(TX_INDEX as u16)) + ); th.txq.check_used_elem(0, 0, 0); // Check that the frame was skipped. assert!(!tap_traffic_simulator.pop_rx_packet(&mut [])); @@ -1694,7 +1759,11 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(TX_INDEX as u16)) + ); th.txq.check_used_elem(0, 0, 0); // Check that the frame was skipped. assert!(!tap_traffic_simulator.pop_rx_packet(&mut [])); @@ -1717,7 +1786,11 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(TX_INDEX as u16)) + ); th.txq.check_used_elem(0, 0, 0); // Check that the frame was skipped. assert!(!tap_traffic_simulator.pop_rx_packet(&mut [])); @@ -1756,7 +1829,11 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 4); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(TX_INDEX as u16)) + ); th.txq.check_used_elem(3, 4, 0); // Check that the valid frame was sent to the tap. let mut buf = vec![0; 1000]; @@ -1787,7 +1864,11 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(TX_INDEX as u16)) + ); th.txq.check_used_elem(0, 3, 0); // Check that the frame was sent to the tap. let mut buf = vec![0; 1000]; @@ -1816,7 +1897,11 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(TX_INDEX as u16)) + ); th.txq.check_used_elem(0, 0, 0); // dropping th would double close the tap fd, so leak it @@ -1847,7 +1932,11 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 2); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(TX_INDEX as u16)) + ); th.txq.check_used_elem(0, 0, 0); th.txq.check_used_elem(1, 3, 0); // Check that the first frame was sent to the tap. 
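The hunks above and below replace the MMIO-specific `IrqTrigger`/`IrqType` pair with the transport-agnostic `VirtioInterrupt` trait throughout the net device and its tests. The trait itself lives under `src/vmm/src/devices/virtio/transport` and is not part of this excerpt; the following is a minimal sketch reconstructed from its call sites in this patch (`trigger`, `status`, `has_pending_interrupt`), and the trait bounds and error type are assumptions rather than the actual definition.

use std::fmt::Debug;
use std::sync::Arc;
use std::sync::atomic::AtomicU32;

/// Interrupt sources a virtio transport can raise towards the guest.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum VirtioInterruptType {
    /// Configuration-change notification.
    Config,
    /// Used-buffer notification for the queue with the given index.
    Queue(u16),
}

/// Transport-agnostic interrupt object shared between a device and its transport.
pub trait VirtioInterrupt: Debug + Send + Sync {
    /// Signal the guest for the given interrupt source.
    fn trigger(&self, int_type: VirtioInterruptType) -> Result<(), std::io::Error>;
    /// Shared interrupt status word, still consumed by the MMIO transport.
    fn status(&self) -> Arc<AtomicU32>;
    /// Test helper: whether the given interrupt was raised and not yet consumed.
    fn has_pending_interrupt(&self, int_type: VirtioInterruptType) -> bool;
}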
@@ -2199,7 +2288,11 @@ pub mod tests { assert_eq!(th.net().metrics.rx_rate_limiter_throttled.count(), 1); assert!(th.net().rx_buffer.used_descriptors != 0); // assert that no operation actually completed (limiter blocked it) - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); // make sure the data is still queued for processing assert_eq!(th.rxq.used.idx.get(), 0); } @@ -2227,7 +2320,11 @@ pub mod tests { // validate the rate_limiter is no longer blocked assert!(!th.net().rx_rate_limiter.is_blocked()); // make sure the virtio queue operation completed this time - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); // make sure the data queue advanced assert_eq!(th.rxq.used.idx.get(), 1); th.rxq @@ -2324,14 +2421,22 @@ pub mod tests { assert!(th.net().metrics.rx_rate_limiter_throttled.count() >= 1); assert!(th.net().rx_buffer.used_descriptors != 0); // assert that no operation actually completed (limiter blocked it) - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); // make sure the data is still queued for processing assert_eq!(th.rxq.used.idx.get(), 0); // trigger the RX handler again, this time it should do the limiter fast path exit th.simulate_event(NetEvent::Tap); // assert that no operation actually completed, that the limiter blocked it - assert!(!&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!( + !th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); // make sure the data is still queued for processing assert_eq!(th.rxq.used.idx.get(), 0); } @@ -2344,7 +2449,11 @@ pub mod tests { { th.simulate_event(NetEvent::RxRateLimiter); // make sure the virtio queue operation completed this time - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); // make sure the data queue advanced assert_eq!(th.rxq.used.idx.get(), 1); th.rxq @@ -2414,7 +2523,14 @@ pub mod tests { assert_eq!(net.queue_events().len(), NET_QUEUE_SIZES.len()); // Test interrupts. - assert!(!&net.irq_trigger.has_pending_irq(IrqType::Vring)); + assert!( + !net.interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); + assert!( + !net.interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(TX_INDEX as u16)) + ); } #[test] diff --git a/src/vmm/src/devices/virtio/net/persist.rs b/src/vmm/src/devices/virtio/net/persist.rs index 50e761273db..6ef8ad842ac 100644 --- a/src/vmm/src/devices/virtio/net/persist.rs +++ b/src/vmm/src/devices/virtio/net/persist.rs @@ -4,7 +4,6 @@ //! Defines the structures needed for saving/restoring net devices. 
use std::io; -use std::sync::atomic::AtomicU32; use std::sync::{Arc, Mutex}; use serde::{Deserialize, Serialize}; @@ -12,8 +11,9 @@ use serde::{Deserialize, Serialize}; use super::device::{Net, RxBuffers}; use super::{NET_NUM_QUEUES, NET_QUEUE_MAX_SIZE, RX_INDEX, TapError}; use crate::devices::virtio::TYPE_NET; -use crate::devices::virtio::device::DeviceState; +use crate::devices::virtio::device::{ActiveState, DeviceState}; use crate::devices::virtio::persist::{PersistError as VirtioStateError, VirtioDeviceState}; +use crate::devices::virtio::transport::VirtioInterrupt; use crate::mmds::data_store::Mmds; use crate::mmds::ns::MmdsNetworkStack; use crate::mmds::persist::MmdsNetworkStackState; @@ -30,27 +30,6 @@ pub struct NetConfigSpaceState { guest_mac: Option, } -/// Information about the parsed RX buffers -#[derive(Debug, Default, Clone, Serialize, Deserialize)] -pub struct RxBufferState { - // Number of iovecs we have parsed from the guest - parsed_descriptor_chains_nr: u16, - // Number of used descriptors - used_descriptors: u16, - // Number of used bytes - used_bytes: u32, -} - -impl RxBufferState { - fn from_rx_buffers(rx_buffer: &RxBuffers) -> Self { - RxBufferState { - parsed_descriptor_chains_nr: rx_buffer.parsed_descriptors.len().try_into().unwrap(), - used_descriptors: rx_buffer.used_descriptors, - used_bytes: rx_buffer.used_bytes, - } - } -} - /// Information about the network device that are saved /// at snapshot. #[derive(Debug, Clone, Serialize, Deserialize)] @@ -62,8 +41,7 @@ pub struct NetState { /// The associated MMDS network stack. pub mmds_ns: Option, config_space: NetConfigSpaceState, - virtio_state: VirtioDeviceState, - rx_buffers_state: RxBufferState, + pub virtio_state: VirtioDeviceState, } /// Auxiliary structure for creating a device when resuming from a snapshot. @@ -106,7 +84,6 @@ impl Persist<'_> for Net { guest_mac: self.guest_mac, }, virtio_state: VirtioDeviceState::from_device(self), - rx_buffers_state: RxBufferState::from_rx_buffers(&self.rx_buffer), } } @@ -148,39 +125,20 @@ impl Persist<'_> for Net { NET_NUM_QUEUES, NET_QUEUE_MAX_SIZE, )?; - net.irq_trigger.irq_status = Arc::new(AtomicU32::new(state.virtio_state.interrupt_status)); net.avail_features = state.virtio_state.avail_features; net.acked_features = state.virtio_state.acked_features; - if state.virtio_state.activated { - let supported_flags: u32 = Net::build_tap_offload_features(net.acked_features); - net.tap - .set_offload(supported_flags) - .map_err(NetPersistError::TapSetOffload)?; - - net.device_state = DeviceState::Activated(constructor_args.mem); - - // Recreate `Net::rx_buffer`. We do it by re-parsing the RX queue. We're temporarily - // rolling back `next_avail` in the RX queue and call `parse_rx_descriptors`. 
- net.queues[RX_INDEX].next_avail -= state.rx_buffers_state.parsed_descriptor_chains_nr; - net.parse_rx_descriptors() - .map_err(|e| NetPersistError::VirtioState(VirtioStateError::InvalidAvailIdx(e)))?; - net.rx_buffer.used_descriptors = state.rx_buffers_state.used_descriptors; - net.rx_buffer.used_bytes = state.rx_buffers_state.used_bytes; - } - Ok(net) } } #[cfg(test)] mod tests { - use std::sync::atomic::Ordering; use super::*; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::net::test_utils::{default_net, default_net_no_mmds}; - use crate::devices::virtio::test_utils::default_mem; + use crate::devices::virtio::test_utils::{default_interrupt, default_mem}; use crate::snapshot::Snapshot; fn validate_save_and_restore(net: Net, mmds_ds: Option>>) { @@ -222,10 +180,6 @@ mod tests { assert_eq!(restored_net.device_type(), TYPE_NET); assert_eq!(restored_net.avail_features(), virtio_state.avail_features); assert_eq!(restored_net.acked_features(), virtio_state.acked_features); - assert_eq!( - restored_net.interrupt_status().load(Ordering::Relaxed), - virtio_state.interrupt_status - ); assert_eq!(restored_net.is_activated(), virtio_state.activated); // Test that net specific fields are the same. diff --git a/src/vmm/src/devices/virtio/net/test_utils.rs b/src/vmm/src/devices/virtio/net/test_utils.rs index 2df7891e034..b4fbdf97e3f 100644 --- a/src/vmm/src/devices/virtio/net/test_utils.rs +++ b/src/vmm/src/devices/virtio/net/test_utils.rs @@ -103,7 +103,7 @@ impl TapTrafficSimulator { let send_addr_ptr = &mut storage as *mut libc::sockaddr_storage; - // SAFETY: `sock_addr` is a valid pointer and safe to derference. + // SAFETY: `sock_addr` is a valid pointer and safe to dereference. unsafe { let sock_addr: *mut libc::sockaddr_ll = send_addr_ptr.cast::(); (*sock_addr).sll_family = libc::sa_family_t::try_from(libc::AF_PACKET).unwrap(); @@ -222,7 +222,7 @@ pub fn if_index(tap: &Tap) -> i32 { /// Enable the tap interface. pub fn enable(tap: &Tap) { - // Disable IPv6 router advertisment requests + // Disable IPv6 router advertisement requests Command::new("sh") .arg("-c") .arg(format!( @@ -291,7 +291,7 @@ pub mod test { use event_manager::{EventManager, SubscriberId, SubscriberOps}; use crate::check_metric_after_block; - use crate::devices::virtio::device::{IrqType, VirtioDevice}; + use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::net::device::vnet_hdr_len; use crate::devices::virtio::net::generated::ETH_HLEN; use crate::devices::virtio::net::test_utils::{ @@ -299,7 +299,8 @@ pub mod test { }; use crate::devices::virtio::net::{MAX_BUFFER_SIZE, Net, RX_INDEX, TX_INDEX}; use crate::devices::virtio::queue::{VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; - use crate::devices::virtio::test_utils::{VirtQueue, VirtqDesc}; + use crate::devices::virtio::test_utils::{VirtQueue, VirtqDesc, default_interrupt}; + use crate::devices::virtio::transport::VirtioInterruptType; use crate::logger::IncMetric; use crate::vstate::memory::{Address, Bytes, GuestAddress, GuestMemoryMmap}; @@ -358,7 +359,12 @@ pub mod test { } pub fn activate_net(&mut self) { - self.net.lock().unwrap().activate(self.mem.clone()).unwrap(); + let interrupt = default_interrupt(); + self.net + .lock() + .unwrap() + .activate(self.mem.clone(), interrupt) + .unwrap(); // Process the activate event. 
let ev_count = self.event_manager.run_with_timeout(100).unwrap(); assert_eq!(ev_count, 1); @@ -435,7 +441,11 @@ pub mod test { old_used_descriptors + 1 ); - assert!(&self.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!( + self.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); frame } @@ -461,7 +471,11 @@ pub mod test { ); // Check that the expected frame was sent to the Rx queue eventually. assert_eq!(self.rxq.used.idx.get(), used_idx + 1); - assert!(&self.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!( + self.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); self.rxq .check_used_elem(used_idx, 0, expected_frame.len().try_into().unwrap()); self.rxq.dtable[0].check_data(expected_frame); diff --git a/src/vmm/src/devices/virtio/persist.rs b/src/vmm/src/devices/virtio/persist.rs index 664f6d57efb..776c7179048 100644 --- a/src/vmm/src/devices/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/persist.rs @@ -10,10 +10,11 @@ use std::sync::{Arc, Mutex}; use serde::{Deserialize, Serialize}; use super::queue::{InvalidAvailIdx, QueueError}; +use super::transport::mmio::IrqTrigger; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::generated::virtio_ring::VIRTIO_RING_F_EVENT_IDX; -use crate::devices::virtio::mmio::MmioTransport; use crate::devices::virtio::queue::Queue; +use crate::devices::virtio::transport::mmio::MmioTransport; use crate::snapshot::Persist; use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; @@ -123,8 +124,6 @@ pub struct VirtioDeviceState { pub acked_features: u64, /// List of queues. pub queues: Vec, - /// The MMIO interrupt status. - pub interrupt_status: u32, /// Flag for activated status. pub activated: bool, } @@ -137,7 +136,6 @@ impl VirtioDeviceState { avail_features: device.avail_features(), acked_features: device.acked_features(), queues: device.queues().iter().map(Persist::save).collect(), - interrupt_status: device.interrupt_status().load(Ordering::Relaxed), activated: device.is_activated(), } } @@ -202,6 +200,7 @@ pub struct MmioTransportState { queue_select: u32, device_status: u32, config_generation: u32, + interrupt_status: u32, } /// Auxiliary structure for initializing the transport when resuming from a snapshot. @@ -209,6 +208,8 @@ pub struct MmioTransportState { pub struct MmioTransportConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, + /// Interrupt to use for the device + pub interrupt: Arc, /// Device associated with the current MMIO state. pub device: Arc>, /// Is device backed by vhost-user. 
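The persistence hunks above move the MMIO `interrupt_status` out of `VirtioDeviceState` and into `MmioTransportState`, while `MmioTransportConstructorArgs` now carries the `IrqTrigger` that the restored transport (and, through `activate`, the device) will share. Below is a hedged sketch of the resulting restore wiring, modelled on the persist.rs tests in this patch; the helper name and its parameters are illustrative only.

use std::sync::{Arc, Mutex};

use crate::devices::virtio::device::VirtioDevice;
use crate::devices::virtio::persist::{MmioTransportConstructorArgs, MmioTransportState};
use crate::devices::virtio::transport::mmio::{IrqTrigger, MmioTransport};
use crate::snapshot::Persist;
use crate::vstate::memory::GuestMemoryMmap;

/// Illustrative helper: rebuild an MMIO transport from a saved state.
fn restore_transport(
    mem: GuestMemoryMmap,
    device: Arc<Mutex<dyn VirtioDevice>>,
    state: &MmioTransportState,
) -> MmioTransport {
    let args = MmioTransportConstructorArgs {
        mem,
        // IrqTrigger construction is no longer fallible in this patch.
        interrupt: Arc::new(IrqTrigger::new()),
        device,
        is_vhost_user: false,
    };
    // restore() reloads the saved interrupt status into the transport's
    // IrqTrigger, so devices no longer persist it through VirtioDeviceState.
    MmioTransport::restore(args, state).unwrap()
}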
@@ -227,6 +228,7 @@ impl Persist<'_> for MmioTransport { queue_select: self.queue_select, device_status: self.device_status, config_generation: self.config_generation, + interrupt_status: self.interrupt.irq_status.load(Ordering::SeqCst), } } @@ -236,6 +238,7 @@ impl Persist<'_> for MmioTransport { ) -> Result { let mut transport = MmioTransport::new( constructor_args.mem, + constructor_args.interrupt, constructor_args.device, constructor_args.is_vhost_user, ); @@ -244,6 +247,10 @@ impl Persist<'_> for MmioTransport { transport.queue_select = state.queue_select; transport.device_status = state.device_status; transport.config_generation = state.config_generation; + transport + .interrupt + .irq_status + .store(state.interrupt_status, Ordering::SeqCst); Ok(transport) } } @@ -256,10 +263,10 @@ mod tests { use crate::devices::virtio::block::virtio::VirtioBlock; use crate::devices::virtio::block::virtio::device::FileEngineType; use crate::devices::virtio::block::virtio::test_utils::default_block_with_path; - use crate::devices::virtio::mmio::tests::DummyDevice; use crate::devices::virtio::net::Net; use crate::devices::virtio::net::test_utils::default_net; use crate::devices::virtio::test_utils::default_mem; + use crate::devices::virtio::transport::mmio::tests::DummyDevice; use crate::devices::virtio::vsock::{Vsock, VsockUnixBackend}; use crate::snapshot::Snapshot; @@ -383,7 +390,7 @@ mod tests { self.queue_select == other.queue_select && self.device_status == other.device_status && self.config_generation == other.config_generation && - self.interrupt_status.load(Ordering::SeqCst) == other.interrupt_status.load(Ordering::SeqCst) && + self.interrupt.irq_status.load(Ordering::SeqCst) == other.interrupt.irq_status.load(Ordering::SeqCst) && // Only checking equality of device type, actual device (de)ser is tested by that // device's tests. self_dev_type == other.device().lock().unwrap().device_type() @@ -392,6 +399,7 @@ mod tests { fn generic_mmiotransport_persistence_test( mmio_transport: MmioTransport, + interrupt: Arc, mem: GuestMemoryMmap, device: Arc>, ) { @@ -401,6 +409,7 @@ mod tests { let restore_args = MmioTransportConstructorArgs { mem, + interrupt, device, is_vhost_user: false, }; @@ -413,8 +422,14 @@ mod tests { assert_eq!(restored_mmio_transport, mmio_transport); } - fn create_default_block() -> (MmioTransport, GuestMemoryMmap, Arc>) { + fn create_default_block() -> ( + MmioTransport, + Arc, + GuestMemoryMmap, + Arc>, + ) { let mem = default_mem(); + let interrupt = Arc::new(IrqTrigger::new()); // Create backing file. 
let f = TempFile::new().unwrap(); @@ -424,25 +439,34 @@ mod tests { FileEngineType::default(), ); let block = Arc::new(Mutex::new(block)); - let mmio_transport = MmioTransport::new(mem.clone(), block.clone(), false); + let mmio_transport = + MmioTransport::new(mem.clone(), interrupt.clone(), block.clone(), false); - (mmio_transport, mem, block) + (mmio_transport, interrupt, mem, block) } - fn create_default_net() -> (MmioTransport, GuestMemoryMmap, Arc>) { + fn create_default_net() -> ( + MmioTransport, + Arc, + GuestMemoryMmap, + Arc>, + ) { let mem = default_mem(); + let interrupt = Arc::new(IrqTrigger::new()); let net = Arc::new(Mutex::new(default_net())); - let mmio_transport = MmioTransport::new(mem.clone(), net.clone(), false); + let mmio_transport = MmioTransport::new(mem.clone(), interrupt.clone(), net.clone(), false); - (mmio_transport, mem, net) + (mmio_transport, interrupt, mem, net) } fn default_vsock() -> ( MmioTransport, + Arc, GuestMemoryMmap, Arc>>, ) { let mem = default_mem(); + let interrupt = Arc::new(IrqTrigger::new()); let guest_cid = 52; let mut temp_uds_path = TempFile::new().unwrap(); @@ -452,26 +476,27 @@ mod tests { let backend = VsockUnixBackend::new(guest_cid, uds_path).unwrap(); let vsock = Vsock::new(guest_cid, backend).unwrap(); let vsock = Arc::new(Mutex::new(vsock)); - let mmio_transport = MmioTransport::new(mem.clone(), vsock.clone(), false); + let mmio_transport = + MmioTransport::new(mem.clone(), interrupt.clone(), vsock.clone(), false); - (mmio_transport, mem, vsock) + (mmio_transport, interrupt, mem, vsock) } #[test] fn test_block_over_mmiotransport_persistence() { - let (mmio_transport, mem, block) = create_default_block(); - generic_mmiotransport_persistence_test(mmio_transport, mem, block); + let (mmio_transport, interrupt, mem, block) = create_default_block(); + generic_mmiotransport_persistence_test(mmio_transport, interrupt, mem, block); } #[test] fn test_net_over_mmiotransport_persistence() { - let (mmio_transport, mem, net) = create_default_net(); - generic_mmiotransport_persistence_test(mmio_transport, mem, net); + let (mmio_transport, interrupt, mem, net) = create_default_net(); + generic_mmiotransport_persistence_test(mmio_transport, interrupt, mem, net); } #[test] fn test_vsock_over_mmiotransport_persistence() { - let (mmio_transport, mem, vsock) = default_vsock(); - generic_mmiotransport_persistence_test(mmio_transport, mem, vsock); + let (mmio_transport, interrupt, mem, vsock) = default_vsock(); + generic_mmiotransport_persistence_test(mmio_transport, interrupt, mem, vsock); } } diff --git a/src/vmm/src/devices/virtio/queue.rs b/src/vmm/src/devices/virtio/queue.rs index ec845fe6394..7fd862f45ca 100644 --- a/src/vmm/src/devices/virtio/queue.rs +++ b/src/vmm/src/devices/virtio/queue.rs @@ -20,7 +20,7 @@ pub(super) const FIRECRACKER_MAX_QUEUE_SIZE: u16 = 256; // GuestMemoryMmap::read_obj_from_addr() will be used to fetch the descriptor, // which has an explicit constraint that the entire descriptor doesn't -// cross the page boundary. Otherwise the descriptor may be splitted into +// cross the page boundary. Otherwise the descriptor may be split into // two mmap regions which causes failure of GuestMemoryMmap::read_obj_from_addr(). 
// // The Virtio Spec 1.0 defines the alignment of VirtIO descriptor is 16 bytes, @@ -280,7 +280,7 @@ impl Queue { pub fn new(max_size: u16) -> Queue { Queue { max_size, - size: 0, + size: max_size, ready: false, desc_table_address: GuestAddress(0), avail_ring_address: GuestAddress(0), @@ -669,6 +669,19 @@ impl Queue { new - used_event - Wrapping(1) < new - old } + + /// Resets the Virtio Queue + pub(crate) fn reset(&mut self) { + self.ready = false; + self.size = self.max_size; + self.desc_table_address = GuestAddress(0); + self.avail_ring_address = GuestAddress(0); + self.used_ring_address = GuestAddress(0); + self.next_avail = Wrapping(0); + self.next_used = Wrapping(0); + self.num_added = Wrapping(0); + self.uses_notif_suppression = false; + } } #[cfg(kani)] diff --git a/src/vmm/src/devices/virtio/rng/device.rs b/src/vmm/src/devices/virtio/rng/device.rs index 38308e9b6b7..2cf1c6bf5dd 100644 --- a/src/vmm/src/devices/virtio/rng/device.rs +++ b/src/vmm/src/devices/virtio/rng/device.rs @@ -2,21 +2,23 @@ // SPDX-License-Identifier: Apache-2.0 use std::io; +use std::ops::Deref; use std::sync::Arc; -use std::sync::atomic::AtomicU32; use aws_lc_rs::rand; +use log::info; use vm_memory::GuestMemoryError; use vmm_sys_util::eventfd::EventFd; use super::metrics::METRICS; use super::{RNG_NUM_QUEUES, RNG_QUEUE}; use crate::devices::DeviceError; -use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; +use crate::devices::virtio::device::{ActiveState, DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::iov_deque::IovDequeError; use crate::devices::virtio::iovec::IoVecBufferMut; use crate::devices::virtio::queue::{FIRECRACKER_MAX_QUEUE_SIZE, InvalidAvailIdx, Queue}; +use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::devices::virtio::{ActivateError, TYPE_RNG}; use crate::logger::{IncMetric, debug, error}; use crate::rate_limiter::{RateLimiter, TokenType}; @@ -47,7 +49,6 @@ pub struct Entropy { device_state: DeviceState, pub(crate) queues: Vec, queue_events: Vec, - irq_trigger: IrqTrigger, // Device specific fields rate_limiter: RateLimiter, @@ -69,7 +70,6 @@ impl Entropy { let queue_events = (0..RNG_NUM_QUEUES) .map(|_| EventFd::new(libc::EFD_NONBLOCK)) .collect::, io::Error>>()?; - let irq_trigger = IrqTrigger::new()?; Ok(Self { avail_features: 1 << VIRTIO_F_VERSION_1, @@ -78,7 +78,6 @@ impl Entropy { device_state: DeviceState::Inactive, queues, queue_events, - irq_trigger, rate_limiter, buffer: IoVecBufferMut::new()?, }) @@ -89,8 +88,8 @@ impl Entropy { } fn signal_used_queue(&self) -> Result<(), DeviceError> { - self.irq_trigger - .trigger_irq(IrqType::Vring) + self.interrupt_trigger() + .trigger(VirtioInterruptType::Queue(RNG_QUEUE.try_into().unwrap())) .map_err(DeviceError::FailedSignalingIrq) } @@ -132,7 +131,7 @@ impl Entropy { let mut used_any = false; while let Some(desc) = self.queues[RNG_QUEUE].pop()? { // This is safe since we checked in the event handler that the device is activated. 
- let mem = self.device_state.mem().unwrap(); + let mem = &self.device_state.active_state().unwrap().mem; let index = desc.index; METRICS.entropy_event_count.inc(); @@ -239,12 +238,12 @@ impl Entropy { self.acked_features = features; } - pub(crate) fn set_irq_status(&mut self, status: u32) { - self.irq_trigger.irq_status = Arc::new(AtomicU32::new(status)); - } - - pub(crate) fn set_activated(&mut self, mem: GuestMemoryMmap) { - self.device_state = DeviceState::Activated(mem); + pub(crate) fn set_activated( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc, + ) { + self.device_state = DeviceState::Activated(ActiveState { mem, interrupt }); } pub(crate) fn activate_event(&self) -> &EventFd { @@ -269,8 +268,12 @@ impl VirtioDevice for Entropy { &self.queue_events } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self.irq_trigger + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { + self.device_state + .active_state() + .expect("Device is not initialized") + .interrupt + .deref() } fn avail_features(&self) -> u64 { @@ -293,7 +296,11 @@ impl VirtioDevice for Entropy { self.device_state.is_activated() } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc, + ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) .map_err(ActivateError::QueueMemoryError)?; @@ -303,9 +310,16 @@ impl VirtioDevice for Entropy { METRICS.activate_fails.inc(); ActivateError::EventFd })?; - self.device_state = DeviceState::Activated(mem); + self.device_state = DeviceState::Activated(ActiveState { mem, interrupt }); Ok(()) } + + fn kick(&mut self) { + if self.is_activated() { + info!("kick entropy {}.", self.id()); + self.process_virtio_queues(); + } + } } #[cfg(test)] diff --git a/src/vmm/src/devices/virtio/rng/persist.rs b/src/vmm/src/devices/virtio/rng/persist.rs index 2f2519b4962..d266e259418 100644 --- a/src/vmm/src/devices/virtio/rng/persist.rs +++ b/src/vmm/src/devices/virtio/rng/persist.rs @@ -3,12 +3,15 @@ //! Defines the structures needed for saving/restoring entropy devices. 
+use std::sync::Arc; + use serde::{Deserialize, Serialize}; use crate::devices::virtio::TYPE_RNG; use crate::devices::virtio::persist::{PersistError as VirtioStateError, VirtioDeviceState}; use crate::devices::virtio::queue::FIRECRACKER_MAX_QUEUE_SIZE; use crate::devices::virtio::rng::{Entropy, EntropyError, RNG_NUM_QUEUES}; +use crate::devices::virtio::transport::VirtioInterrupt; use crate::rate_limiter::RateLimiter; use crate::rate_limiter::persist::RateLimiterState; use crate::snapshot::Persist; @@ -16,17 +19,13 @@ use crate::vstate::memory::GuestMemoryMmap; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct EntropyState { - virtio_state: VirtioDeviceState, + pub virtio_state: VirtioDeviceState, rate_limiter_state: RateLimiterState, } #[derive(Debug)] -pub struct EntropyConstructorArgs(GuestMemoryMmap); - -impl EntropyConstructorArgs { - pub fn new(mem: GuestMemoryMmap) -> Self { - Self(mem) - } +pub struct EntropyConstructorArgs { + pub mem: GuestMemoryMmap, } #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -56,7 +55,7 @@ impl Persist<'_> for Entropy { state: &Self::State, ) -> Result { let queues = state.virtio_state.build_queues_checked( - &constructor_args.0, + &constructor_args.mem, TYPE_RNG, RNG_NUM_QUEUES, FIRECRACKER_MAX_QUEUE_SIZE, @@ -66,10 +65,6 @@ impl Persist<'_> for Entropy { let mut entropy = Entropy::new_with_queues(queues, rate_limiter)?; entropy.set_avail_features(state.virtio_state.avail_features); entropy.set_acked_features(state.virtio_state.acked_features); - entropy.set_irq_status(state.virtio_state.interrupt_status); - if state.virtio_state.activated { - entropy.set_activated(constructor_args.0); - } Ok(entropy) } @@ -77,11 +72,11 @@ impl Persist<'_> for Entropy { #[cfg(test)] mod tests { - use std::sync::atomic::Ordering; use super::*; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::rng::device::ENTROPY_DEV_ID; + use crate::devices::virtio::test_utils::default_interrupt; use crate::devices::virtio::test_utils::test::create_virtio_mem; use crate::snapshot::Snapshot; @@ -94,19 +89,16 @@ mod tests { let guest_mem = create_virtio_mem(); let restored = Entropy::restore( - EntropyConstructorArgs(guest_mem), + EntropyConstructorArgs { mem: guest_mem }, &Snapshot::deserialize(&mut mem.as_slice()).unwrap(), ) .unwrap(); assert_eq!(restored.device_type(), TYPE_RNG); assert_eq!(restored.id(), ENTROPY_DEV_ID); - assert_eq!(restored.is_activated(), entropy.is_activated()); + assert!(!restored.is_activated()); + assert!(!entropy.is_activated()); assert_eq!(restored.avail_features(), entropy.avail_features()); assert_eq!(restored.acked_features(), entropy.acked_features()); - assert_eq!( - restored.interrupt_status().load(Ordering::Relaxed), - entropy.interrupt_status().load(Ordering::Relaxed) - ); } } diff --git a/src/vmm/src/devices/virtio/test_utils.rs b/src/vmm/src/devices/virtio/test_utils.rs index 8642d0a85f4..861394c1c7d 100644 --- a/src/vmm/src/devices/virtio/test_utils.rs +++ b/src/vmm/src/devices/virtio/test_utils.rs @@ -6,9 +6,12 @@ use std::fmt::Debug; use std::marker::PhantomData; use std::mem; +use std::sync::Arc; use std::sync::atomic::{AtomicUsize, Ordering}; use crate::devices::virtio::queue::Queue; +use crate::devices::virtio::transport::VirtioInterrupt; +use crate::devices::virtio::transport::mmio::IrqTrigger; use crate::test_utils::single_region_mem; use crate::utils::{align_up, u64_to_usize}; use crate::vstate::memory::{Address, Bytes, GuestAddress, GuestMemoryMmap}; @@ -28,6 +31,11 @@ pub fn default_mem() 
-> GuestMemoryMmap { single_region_mem(0x10000) } +/// Creates a default ['IrqTrigger'] interrupt for a VirtIO device. +pub fn default_interrupt() -> Arc { + Arc::new(IrqTrigger::new()) +} + #[derive(Debug)] pub struct InputData { pub data: Vec, @@ -323,7 +331,7 @@ pub(crate) mod test { use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::net::MAX_BUFFER_SIZE; use crate::devices::virtio::queue::{Queue, VIRTQ_DESC_F_NEXT}; - use crate::devices::virtio::test_utils::{VirtQueue, VirtqDesc}; + use crate::devices::virtio::test_utils::{VirtQueue, VirtqDesc, default_interrupt}; use crate::test_utils::single_region_mem; use crate::vstate::memory::{Address, GuestAddress, GuestMemoryMmap}; @@ -414,7 +422,12 @@ pub(crate) mod test { /// Activate the device pub fn activate_device(&mut self, mem: &'a GuestMemoryMmap) { - self.device.lock().unwrap().activate(mem.clone()).unwrap(); + let interrupt = default_interrupt(); + self.device + .lock() + .unwrap() + .activate(mem.clone(), interrupt) + .unwrap(); // Process the activate event let ev_count = self.event_manager.run_with_timeout(100).unwrap(); assert_eq!(ev_count, 1); diff --git a/src/vmm/src/devices/virtio/mmio.rs b/src/vmm/src/devices/virtio/transport/mmio.rs similarity index 75% rename from src/vmm/src/devices/virtio/mmio.rs rename to src/vmm/src/devices/virtio/transport/mmio.rs index 4114838bdd3..4964f837aca 100644 --- a/src/vmm/src/devices/virtio/mmio.rs +++ b/src/vmm/src/devices/virtio/transport/mmio.rs @@ -7,9 +7,12 @@ use std::fmt::Debug; use std::sync::atomic::{AtomicU32, Ordering}; -use std::sync::{Arc, Mutex, MutexGuard}; +use std::sync::{Arc, Barrier, Mutex, MutexGuard}; -use crate::devices::virtio::device::{IrqType, VirtioDevice}; +use vmm_sys_util::eventfd::EventFd; + +use super::{VirtioInterrupt, VirtioInterruptType}; +use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::device_status; use crate::devices::virtio::queue::Queue; use crate::logger::{error, warn}; @@ -44,7 +47,7 @@ const MMIO_VERSION: u32 = 2; /// /// Typically one page (4096 bytes) of MMIO address space is sufficient to handle this transport /// and inner virtio device. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct MmioTransport { device: Arc>, // The register where feature bits are stored. @@ -55,7 +58,7 @@ pub struct MmioTransport { pub(crate) device_status: u32, pub(crate) config_generation: u32, mem: GuestMemoryMmap, - pub(crate) interrupt_status: Arc, + pub(crate) interrupt: Arc, pub is_vhost_user: bool, } @@ -63,11 +66,10 @@ impl MmioTransport { /// Constructs a new MMIO transport for the given virtio device. pub fn new( mem: GuestMemoryMmap, + interrupt: Arc, device: Arc>, is_vhost_user: bool, ) -> MmioTransport { - let interrupt_status = device.lock().expect("Poisoned lock").interrupt_status(); - MmioTransport { device, features_select: 0, @@ -76,7 +78,7 @@ impl MmioTransport { device_status: device_status::INIT, config_generation: 0, mem, - interrupt_status, + interrupt, is_vhost_user, } } @@ -144,7 +146,7 @@ impl MmioTransport { self.features_select = 0; self.acked_features_select = 0; self.queue_select = 0; - self.interrupt_status.store(0, Ordering::SeqCst); + self.interrupt.irq_status.store(0, Ordering::SeqCst); self.device_status = device_status::INIT; // . Keep interrupt_evt and queue_evts as is. There may be pending notifications in those // eventfds, but nothing will happen other than supurious wakeups. 
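// Sketch of the new construction flow shown above: the caller now owns the interrupt
// object and hands the same Arc both to the transport and, at DRIVER_OK time, to the
// device's activate(), instead of the transport pulling an IrqTrigger out of the device.
// DummyDevice stands in for any VirtioDevice implementation, as in the tests below.
fn build_mmio_transport(mem: GuestMemoryMmap, device: Arc<Mutex<DummyDevice>>) -> MmioTransport {
    let interrupt = Arc::new(IrqTrigger::new());
    MmioTransport::new(mem, interrupt, device, false)
}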
@@ -177,19 +179,18 @@ impl MmioTransport { } DRIVER_OK if self.device_status == (ACKNOWLEDGE | DRIVER | FEATURES_OK) => { self.device_status = status; - let device_activated = self.locked_device().is_activated(); + let mut locked_device = self.device.lock().expect("Poisoned lock"); + let device_activated = locked_device.is_activated(); if !device_activated { // temporary variable needed for borrow checker - let activate_result = self.locked_device().activate(self.mem.clone()); + let activate_result = + locked_device.activate(self.mem.clone(), self.interrupt.clone()); if let Err(err) = activate_result { self.device_status |= DEVICE_NEEDS_RESET; // Section 2.1.2 of the specification states that we need to send a device // configuration change interrupt - let _ = self - .locked_device() - .interrupt_trigger() - .trigger_irq(IrqType::Config); + let _ = self.interrupt.trigger(VirtioInterruptType::Config); error!("Failed to activate virtio device: {}", err) } @@ -200,16 +201,19 @@ impl MmioTransport { self.device_status |= FAILED; } _ if status == 0 => { - if self.locked_device().is_activated() { - let mut device_status = self.device_status; - let reset_result = self.locked_device().reset(); - match reset_result { - Some((_interrupt_evt, mut _queue_evts)) => {} - None => { - device_status |= FAILED; + { + let mut locked_device = self.device.lock().expect("Poisoned lock"); + if locked_device.is_activated() { + let mut device_status = self.device_status; + let reset_result = locked_device.reset(); + match reset_result { + Some((_interrupt_evt, mut _queue_evts)) => {} + None => { + device_status |= FAILED; + } } + self.device_status = device_status; } - self.device_status = device_status; } // If the backend device driver doesn't support reset, @@ -228,8 +232,8 @@ impl MmioTransport { } } -impl MmioTransport { - pub fn bus_read(&mut self, offset: u64, data: &mut [u8]) { +impl vm_device::BusDevice for MmioTransport { + fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) { match offset { 0x00..=0xff if data.len() == 4 => { let v = match offset { @@ -263,7 +267,7 @@ impl MmioTransport { // `VIRTIO_MMIO_INT_CONFIG` or not to understand if we need to send // `VIRTIO_MMIO_INT_CONFIG` or // `VIRTIO_MMIO_INT_VRING`. - let is = self.interrupt_status.load(Ordering::SeqCst); + let is = self.interrupt.irq_status.load(Ordering::SeqCst); if !self.is_vhost_user { is } else if is == VIRTIO_MMIO_INT_CONFIG { @@ -283,12 +287,15 @@ impl MmioTransport { } 0x100..=0xfff => self.locked_device().read_config(offset - 0x100, data), _ => { - warn!("invalid virtio mmio read: {:#x}:{:#x}", offset, data.len()); + warn!( + "invalid virtio mmio read: {base:#x}:{offset:#x}:{:#x}", + data.len() + ); } }; } - pub fn bus_write(&mut self, offset: u64, data: &[u8]) { + fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option> { fn hi(v: &mut GuestAddress, x: u32) { *v = (*v & 0xffff_ffff) | (u64::from(x) << 32) } @@ -324,7 +331,7 @@ impl MmioTransport { 0x44 => self.update_queue_field(|q| q.ready = v == 1), 0x64 => { if self.check_device_status(device_status::DRIVER_OK, 0) { - self.interrupt_status.fetch_and(!v, Ordering::SeqCst); + self.interrupt.irq_status.fetch_and(!v, Ordering::SeqCst); } } 0x70 => self.set_device_status(v), @@ -350,19 +357,124 @@ impl MmioTransport { } } _ => { - warn!("invalid virtio mmio write: {:#x}:{:#x}", offset, data.len()); + warn!( + "invalid virtio mmio write: {base:#x}:{offset:#x}:{:#x}", + data.len() + ); } } + None + } +} + +/// The 2 types of interrupt sources in MMIO transport. 
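// Sketch of the guest-visible ISR protocol handled above, driven through the new
// vm_device::BusDevice entry points (note the extra `base` argument): the guest reads
// the interrupt status at offset 0x60 and acknowledges the handled bits by writing them
// back to 0x64, which clears them via fetch_and(!v) once DRIVER_OK has been set.
fn read_and_ack_isr(transport: &mut MmioTransport, base: u64) -> u32 {
    use vm_device::BusDevice;

    let mut buf = [0u8; 4];
    transport.read(base, 0x60, &mut buf);
    let isr = u32::from_le_bytes(buf);
    let _ = transport.write(base, 0x64, &isr.to_le_bytes());
    isr
}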
+#[derive(Debug)] +pub enum IrqType { + /// Interrupt triggered by change in config. + Config, + /// Interrupt triggered by used vring buffers. + Vring, +} + +impl From for IrqType { + fn from(interrupt_type: VirtioInterruptType) -> Self { + match interrupt_type { + VirtioInterruptType::Config => IrqType::Config, + VirtioInterruptType::Queue(_) => IrqType::Vring, + } + } +} + +/// Helper struct that is responsible for triggering guest IRQs +#[derive(Debug)] +pub struct IrqTrigger { + pub(crate) irq_status: Arc, + pub(crate) irq_evt: EventFd, +} + +impl Default for IrqTrigger { + fn default() -> Self { + Self::new() + } +} + +impl VirtioInterrupt for IrqTrigger { + fn trigger(&self, interrupt_type: VirtioInterruptType) -> Result<(), std::io::Error> { + match interrupt_type { + VirtioInterruptType::Config => self.trigger_irq(IrqType::Config), + VirtioInterruptType::Queue(_) => self.trigger_irq(IrqType::Vring), + } + } + + fn trigger_queues(&self, queues: &[u16]) -> Result<(), std::io::Error> { + if queues.is_empty() { + Ok(()) + } else { + self.trigger_irq(IrqType::Vring) + } + } + + fn notifier(&self, _interrupt_type: VirtioInterruptType) -> Option<&EventFd> { + Some(&self.irq_evt) + } + + fn status(&self) -> Arc { + self.irq_status.clone() + } + + #[cfg(test)] + fn has_pending_interrupt(&self, interrupt_type: VirtioInterruptType) -> bool { + if let Ok(num_irqs) = self.irq_evt.read() { + if num_irqs == 0 { + return false; + } + + let irq_status = self.irq_status.load(Ordering::SeqCst); + return matches!( + (irq_status, interrupt_type.into()), + (VIRTIO_MMIO_INT_CONFIG, IrqType::Config) | (VIRTIO_MMIO_INT_VRING, IrqType::Vring) + ); + } + + false + } +} + +impl IrqTrigger { + pub fn new() -> Self { + Self { + irq_status: Arc::new(AtomicU32::new(0)), + irq_evt: EventFd::new(libc::EFD_NONBLOCK) + .expect("Could not create EventFd for IrqTrigger"), + } + } + + fn trigger_irq(&self, irq_type: IrqType) -> Result<(), std::io::Error> { + let irq = match irq_type { + IrqType::Config => VIRTIO_MMIO_INT_CONFIG, + IrqType::Vring => VIRTIO_MMIO_INT_VRING, + }; + self.irq_status.fetch_or(irq, Ordering::SeqCst); + + self.irq_evt.write(1).map_err(|err| { + error!("Failed to send irq to the guest: {:?}", err); + err + })?; + + Ok(()) } } #[cfg(test)] pub(crate) mod tests { + + use std::ops::Deref; + + use vm_device::BusDevice; use vmm_sys_util::eventfd::EventFd; use super::*; use crate::devices::virtio::ActivateError; - use crate::devices::virtio::device::IrqTrigger; use crate::devices::virtio::device_status::DEVICE_NEEDS_RESET; use crate::test_utils::single_region_mem; use crate::utils::byte_order::{read_le_u32, write_le_u32}; @@ -373,7 +485,7 @@ pub(crate) mod tests { pub(crate) struct DummyDevice { acked_features: u64, avail_features: u64, - interrupt_trigger: IrqTrigger, + interrupt_trigger: Option>, queue_evts: Vec, queues: Vec, device_activated: bool, @@ -386,7 +498,7 @@ pub(crate) mod tests { DummyDevice { acked_features: 0, avail_features: 0, - interrupt_trigger: IrqTrigger::new().unwrap(), + interrupt_trigger: None, queue_evts: vec![ EventFd::new(libc::EFD_NONBLOCK).unwrap(), EventFd::new(libc::EFD_NONBLOCK).unwrap(), @@ -432,8 +544,11 @@ pub(crate) mod tests { &self.queue_evts } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self.interrupt_trigger + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { + self.interrupt_trigger + .as_ref() + .expect("Device is not activated") + .deref() } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -446,8 +561,13 @@ pub(crate) mod tests { } } - fn 
activate(&mut self, _: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + _: GuestMemoryMmap, + interrupt: Arc, + ) -> Result<(), ActivateError> { self.device_activated = true; + self.interrupt_trigger = Some(interrupt); if self.activate_should_error { Err(ActivateError::EventFd) } else { @@ -463,16 +583,17 @@ pub(crate) mod tests { fn set_device_status(d: &mut MmioTransport, status: u32) { let mut buf = [0; 4]; write_le_u32(&mut buf[..], status); - d.bus_write(0x70, &buf[..]); + d.write(0x0, 0x70, &buf[..]); } #[test] fn test_new() { let m = single_region_mem(0x1000); + let interrupt = Arc::new(IrqTrigger::new()); let mut dummy = DummyDevice::new(); // Validate reset is no-op. assert!(dummy.reset().is_none()); - let mut d = MmioTransport::new(m, Arc::new(Mutex::new(dummy)), false); + let mut d = MmioTransport::new(m, interrupt, Arc::new(Mutex::new(dummy)), false); // We just make sure here that the implementation of a mmio device behaves as we expect, // given a known virtio device implementation (the dummy device). @@ -497,14 +618,20 @@ pub(crate) mod tests { #[test] fn test_bus_device_read() { let m = single_region_mem(0x1000); - let mut d = MmioTransport::new(m, Arc::new(Mutex::new(DummyDevice::new())), false); + let interrupt = Arc::new(IrqTrigger::new()); + let mut d = MmioTransport::new( + m, + interrupt, + Arc::new(Mutex::new(DummyDevice::new())), + false, + ); let mut buf = vec![0xff, 0, 0xfe, 0]; let buf_copy = buf.to_vec(); // The following read shouldn't be valid, because the length of the buf is not 4. buf.push(0); - d.bus_read(0, &mut buf[..]); + d.read(0x0, 0, &mut buf[..]); assert_eq!(buf[..4], buf_copy[..]); // the length is ok again @@ -512,73 +639,74 @@ pub(crate) mod tests { // Now we test that reading at various predefined offsets works as intended. 
- d.bus_read(0, &mut buf[..]); + d.read(0x0, 0, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), MMIO_MAGIC_VALUE); - d.bus_read(0x04, &mut buf[..]); + d.read(0x0, 0x04, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), MMIO_VERSION); - d.bus_read(0x08, &mut buf[..]); + d.read(0x0, 0x08, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), d.locked_device().device_type()); - d.bus_read(0x0c, &mut buf[..]); + d.read(0x0, 0x0c, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), VENDOR_ID); d.features_select = 0; - d.bus_read(0x10, &mut buf[..]); + d.read(0x0, 0x10, &mut buf[..]); assert_eq!( read_le_u32(&buf[..]), d.locked_device().avail_features_by_page(0) ); d.features_select = 1; - d.bus_read(0x10, &mut buf[..]); + d.read(0x0, 0x10, &mut buf[..]); assert_eq!( read_le_u32(&buf[..]), d.locked_device().avail_features_by_page(0) | 0x1 ); - d.bus_read(0x34, &mut buf[..]); + d.read(0x0, 0x34, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), 16); - d.bus_read(0x44, &mut buf[..]); + d.read(0x0, 0x44, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), u32::from(false)); - d.interrupt_status.store(111, Ordering::SeqCst); - d.bus_read(0x60, &mut buf[..]); + d.interrupt.irq_status.store(111, Ordering::SeqCst); + d.read(0x0, 0x60, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), 111); d.is_vhost_user = true; - d.interrupt_status.store(0, Ordering::SeqCst); - d.bus_read(0x60, &mut buf[..]); + d.interrupt.status().store(0, Ordering::SeqCst); + d.read(0x0, 0x60, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), VIRTIO_MMIO_INT_VRING); d.is_vhost_user = true; - d.interrupt_status + d.interrupt + .irq_status .store(VIRTIO_MMIO_INT_CONFIG, Ordering::SeqCst); - d.bus_read(0x60, &mut buf[..]); + d.read(0x0, 0x60, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), VIRTIO_MMIO_INT_CONFIG); - d.bus_read(0x70, &mut buf[..]); + d.read(0x0, 0x70, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), 0); d.config_generation = 5; - d.bus_read(0xfc, &mut buf[..]); + d.read(0x0, 0xfc, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), 5); // This read shouldn't do anything, as it's past the readable generic registers, and // before the device specific configuration space. Btw, reads from the device specific // conf space are going to be tested a bit later, alongside writes. buf = buf_copy.to_vec(); - d.bus_read(0xfd, &mut buf[..]); + d.read(0x0, 0xfd, &mut buf[..]); assert_eq!(buf[..], buf_copy[..]); // Read from an invalid address in generic register range. - d.bus_read(0xfb, &mut buf[..]); + d.read(0x0, 0xfb, &mut buf[..]); assert_eq!(buf[..], buf_copy[..]); // Read from an invalid length in generic register range. - d.bus_read(0xfc, &mut buf[..3]); + d.read(0x0, 0xfc, &mut buf[..3]); assert_eq!(buf[..], buf_copy[..]); } @@ -586,14 +714,15 @@ pub(crate) mod tests { #[allow(clippy::cognitive_complexity)] fn test_bus_device_write() { let m = single_region_mem(0x1000); + let interrupt = Arc::new(IrqTrigger::new()); let dummy_dev = Arc::new(Mutex::new(DummyDevice::new())); - let mut d = MmioTransport::new(m, dummy_dev.clone(), false); + let mut d = MmioTransport::new(m, interrupt, dummy_dev.clone(), false); let mut buf = vec![0; 5]; write_le_u32(&mut buf[..4], 1); // Nothing should happen, because the slice len > 4. 
d.features_select = 0; - d.bus_write(0x14, &buf[..]); + d.write(0x0, 0x14, &buf[..]); assert_eq!(d.features_select, 0); buf.pop(); @@ -605,7 +734,7 @@ pub(crate) mod tests { assert_eq!(d.locked_device().acked_features(), 0x0); d.acked_features_select = 0x0; write_le_u32(&mut buf[..], 1); - d.bus_write(0x20, &buf[..]); + d.write(0x0, 0x20, &buf[..]); assert_eq!(d.locked_device().acked_features(), 0x0); // Write to device specific configuration space should be ignored before setting @@ -614,8 +743,8 @@ pub(crate) mod tests { for i in (0..0xeff).rev() { let mut buf2 = vec![0; 0xeff]; - d.bus_write(0x100 + i as u64, &buf1[i..]); - d.bus_read(0x100, &mut buf2[..]); + d.write(0x0, 0x100 + i as u64, &buf1[i..]); + d.read(0x0, 0x100, &mut buf2[..]); for item in buf2.iter().take(0xeff) { assert_eq!(*item, 0); @@ -631,7 +760,7 @@ pub(crate) mod tests { // now writes should work d.features_select = 0; write_le_u32(&mut buf[..], 1); - d.bus_write(0x14, &buf[..]); + d.write(0x0, 0x14, &buf[..]); assert_eq!(d.features_select, 1); // Test acknowledging features on bus. @@ -640,12 +769,12 @@ pub(crate) mod tests { // Set the device available features in order to make acknowledging possible. dummy_dev.lock().unwrap().set_avail_features(0x124); - d.bus_write(0x20, &buf[..]); + d.write(0x0, 0x20, &buf[..]); assert_eq!(d.locked_device().acked_features(), 0x124); d.acked_features_select = 0; write_le_u32(&mut buf[..], 2); - d.bus_write(0x24, &buf[..]); + d.write(0x0, 0x24, &buf[..]); assert_eq!(d.acked_features_select, 2); set_device_status( &mut d, @@ -656,31 +785,31 @@ pub(crate) mod tests { assert_eq!(d.locked_device().acked_features(), 0x124); d.acked_features_select = 0x0; write_le_u32(&mut buf[..], 1); - d.bus_write(0x20, &buf[..]); + d.write(0x0, 0x20, &buf[..]); assert_eq!(d.locked_device().acked_features(), 0x124); // Setup queues d.queue_select = 0; write_le_u32(&mut buf[..], 3); - d.bus_write(0x30, &buf[..]); + d.write(0x0, 0x30, &buf[..]); assert_eq!(d.queue_select, 3); d.queue_select = 0; - assert_eq!(d.locked_device().queues()[0].size, 0); + assert_eq!(d.locked_device().queues()[0].size, 16); write_le_u32(&mut buf[..], 16); - d.bus_write(0x38, &buf[..]); + d.write(0x0, 0x38, &buf[..]); assert_eq!(d.locked_device().queues()[0].size, 16); assert!(!d.locked_device().queues()[0].ready); write_le_u32(&mut buf[..], 1); - d.bus_write(0x44, &buf[..]); + d.write(0x0, 0x44, &buf[..]); assert!(d.locked_device().queues()[0].ready); assert_eq!(d.locked_device().queues()[0].desc_table_address.0, 0); write_le_u32(&mut buf[..], 123); - d.bus_write(0x80, &buf[..]); + d.write(0x0, 0x80, &buf[..]); assert_eq!(d.locked_device().queues()[0].desc_table_address.0, 123); - d.bus_write(0x84, &buf[..]); + d.write(0x0, 0x84, &buf[..]); assert_eq!( d.locked_device().queues()[0].desc_table_address.0, 123 + (123 << 32) @@ -688,9 +817,9 @@ pub(crate) mod tests { assert_eq!(d.locked_device().queues()[0].avail_ring_address.0, 0); write_le_u32(&mut buf[..], 124); - d.bus_write(0x90, &buf[..]); + d.write(0x0, 0x90, &buf[..]); assert_eq!(d.locked_device().queues()[0].avail_ring_address.0, 124); - d.bus_write(0x94, &buf[..]); + d.write(0x0, 0x94, &buf[..]); assert_eq!( d.locked_device().queues()[0].avail_ring_address.0, 124 + (124 << 32) @@ -698,9 +827,9 @@ pub(crate) mod tests { assert_eq!(d.locked_device().queues()[0].used_ring_address.0, 0); write_le_u32(&mut buf[..], 125); - d.bus_write(0xa0, &buf[..]); + d.write(0x0, 0xa0, &buf[..]); assert_eq!(d.locked_device().queues()[0].used_ring_address.0, 125); - d.bus_write(0xa4, 
&buf[..]); + d.write(0x0, 0xa4, &buf[..]); assert_eq!( d.locked_device().queues()[0].used_ring_address.0, 125 + (125 << 32) @@ -714,19 +843,19 @@ pub(crate) mod tests { | device_status::DRIVER_OK, ); - d.interrupt_status.store(0b10_1010, Ordering::Relaxed); + d.interrupt.irq_status.store(0b10_1010, Ordering::Relaxed); write_le_u32(&mut buf[..], 0b111); - d.bus_write(0x64, &buf[..]); - assert_eq!(d.interrupt_status.load(Ordering::Relaxed), 0b10_1000); + d.write(0x0, 0x64, &buf[..]); + assert_eq!(d.interrupt.irq_status.load(Ordering::Relaxed), 0b10_1000); // Write to an invalid address in generic register range. write_le_u32(&mut buf[..], 0xf); d.config_generation = 0; - d.bus_write(0xfb, &buf[..]); + d.write(0x0, 0xfb, &buf[..]); assert_eq!(d.config_generation, 0); // Write to an invalid length in generic register range. - d.bus_write(0xfc, &buf[..2]); + d.write(0x0, 0xfc, &buf[..2]); assert_eq!(d.config_generation, 0); // Here we test writes/read into/from the device specific configuration space. @@ -734,8 +863,8 @@ pub(crate) mod tests { for i in (0..0xeff).rev() { let mut buf2 = vec![0; 0xeff]; - d.bus_write(0x100 + i as u64, &buf1[i..]); - d.bus_read(0x100, &mut buf2[..]); + d.write(0x0, 0x100 + i as u64, &buf1[i..]); + d.read(0x0, 0x100, &mut buf2[..]); for item in buf2.iter().take(i) { assert_eq!(*item, 0); @@ -748,7 +877,13 @@ pub(crate) mod tests { #[test] fn test_bus_device_activate() { let m = single_region_mem(0x1000); - let mut d = MmioTransport::new(m, Arc::new(Mutex::new(DummyDevice::new())), false); + let interrupt = Arc::new(IrqTrigger::new()); + let mut d = MmioTransport::new( + m, + interrupt, + Arc::new(Mutex::new(DummyDevice::new())), + false, + ); assert!(!d.locked_device().is_activated()); assert_eq!(d.device_status, device_status::INIT); @@ -784,17 +919,17 @@ pub(crate) mod tests { for q in 0..queue_len { d.queue_select = q.try_into().unwrap(); write_le_u32(&mut buf[..], 16); - d.bus_write(0x38, &buf[..]); + d.write(0x0, 0x38, &buf[..]); write_le_u32(&mut buf[..], 1); - d.bus_write(0x44, &buf[..]); + d.write(0x0, 0x44, &buf[..]); } assert!(!d.locked_device().is_activated()); // Device should be ready for activation now. // A couple of invalid writes; will trigger warnings; shouldn't activate the device. - d.bus_write(0xa8, &buf[..]); - d.bus_write(0x1000, &buf[..]); + d.write(0x0, 0xa8, &buf[..]); + d.write(0x0, 0x1000, &buf[..]); assert!(!d.locked_device().is_activated()); set_device_status( @@ -817,19 +952,20 @@ pub(crate) mod tests { // a warning path and have no effect on queue state. 
write_le_u32(&mut buf[..], 0); d.queue_select = 0; - d.bus_write(0x44, &buf[..]); - d.bus_read(0x44, &mut buf[..]); + d.write(0x0, 0x44, &buf[..]); + d.read(0x0, 0x44, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), 1); } #[test] fn test_bus_device_activate_failure() { let m = single_region_mem(0x1000); + let interrupt = Arc::new(IrqTrigger::new()); let device = DummyDevice { activate_should_error: true, ..DummyDevice::new() }; - let mut d = MmioTransport::new(m, Arc::new(Mutex::new(device)), false); + let mut d = MmioTransport::new(m, interrupt, Arc::new(Mutex::new(device)), false); set_device_status(&mut d, device_status::ACKNOWLEDGE); set_device_status(&mut d, device_status::ACKNOWLEDGE | device_status::DRIVER); @@ -843,14 +979,11 @@ pub(crate) mod tests { for q in 0..queue_len { d.queue_select = q.try_into().unwrap(); write_le_u32(&mut buf[..], 16); - d.bus_write(0x38, &buf[..]); + d.write(0x0, 0x38, &buf[..]); write_le_u32(&mut buf[..], 1); - d.bus_write(0x44, &buf[..]); + d.write(0x0, 0x44, &buf[..]); } - assert_eq!( - d.locked_device().interrupt_status().load(Ordering::SeqCst), - 0 - ); + assert!(!d.locked_device().is_activated()); set_device_status( &mut d, @@ -871,7 +1004,8 @@ pub(crate) mod tests { assert_eq!( d.locked_device() .interrupt_trigger() - .irq_evt + .notifier(VirtioInterruptType::Config) + .unwrap() .read() .unwrap(), 1 @@ -892,9 +1026,9 @@ pub(crate) mod tests { for q in 0..queues_count { d.queue_select = q.try_into().unwrap(); write_le_u32(&mut buf[..], 16); - d.bus_write(0x38, &buf[..]); + d.write(0x0, 0x38, &buf[..]); write_le_u32(&mut buf[..], 1); - d.bus_write(0x44, &buf[..]); + d.write(0x0, 0x44, &buf[..]); } assert!(!d.locked_device().is_activated()); @@ -919,7 +1053,13 @@ pub(crate) mod tests { #[test] fn test_bus_device_reset() { let m = single_region_mem(0x1000); - let mut d = MmioTransport::new(m, Arc::new(Mutex::new(DummyDevice::new())), false); + let interrupt = Arc::new(IrqTrigger::new()); + let mut d = MmioTransport::new( + m, + interrupt, + Arc::new(Mutex::new(DummyDevice::new())), + false, + ); let mut buf = [0; 4]; assert!(!d.locked_device().is_activated()); @@ -928,13 +1068,13 @@ pub(crate) mod tests { // Marking device as FAILED should not affect device_activated state write_le_u32(&mut buf[..], 0x8f); - d.bus_write(0x70, &buf[..]); + d.write(0x0, 0x70, &buf[..]); assert_eq!(d.device_status, 0x8f); assert!(d.locked_device().is_activated()); // Nothing happens when backend driver doesn't support reset write_le_u32(&mut buf[..], 0x0); - d.bus_write(0x70, &buf[..]); + d.write(0x0, 0x70, &buf[..]); assert_eq!(d.device_status, 0x8f); assert!(d.locked_device().is_activated()); } @@ -968,4 +1108,30 @@ pub(crate) mod tests { dummy_dev.ack_features_by_page(0, 8); assert_eq!(dummy_dev.acked_features(), 24); } + + #[test] + fn irq_trigger() { + let irq_trigger = IrqTrigger::new(); + assert_eq!(irq_trigger.irq_status.load(Ordering::SeqCst), 0); + + // Check that there are no pending irqs. + assert!(!irq_trigger.has_pending_interrupt(VirtioInterruptType::Config)); + assert!(!irq_trigger.has_pending_interrupt(VirtioInterruptType::Queue(0))); + + // Check that trigger_irq() correctly generates irqs. 
+ irq_trigger.trigger(VirtioInterruptType::Config).unwrap(); + assert!(irq_trigger.has_pending_interrupt(VirtioInterruptType::Config)); + irq_trigger.irq_status.store(0, Ordering::SeqCst); + irq_trigger.trigger(VirtioInterruptType::Queue(0)).unwrap(); + assert!(irq_trigger.has_pending_interrupt(VirtioInterruptType::Queue(0))); + + // Check trigger_irq() failure case (irq_evt is full). + irq_trigger.irq_evt.write(u64::MAX - 1).unwrap(); + irq_trigger + .trigger(VirtioInterruptType::Config) + .unwrap_err(); + irq_trigger + .trigger(VirtioInterruptType::Queue(0)) + .unwrap_err(); + } } diff --git a/src/vmm/src/devices/virtio/transport/mod.rs b/src/vmm/src/devices/virtio/transport/mod.rs new file mode 100644 index 00000000000..39dfe05a4fd --- /dev/null +++ b/src/vmm/src/devices/virtio/transport/mod.rs @@ -0,0 +1,51 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::sync::Arc; +use std::sync::atomic::AtomicU32; + +use vmm_sys_util::eventfd::EventFd; + +/// MMIO transport for VirtIO devices +pub mod mmio; +/// PCI transport for VirtIO devices +pub mod pci; + +/// Represents the types of interrupts used by VirtIO devices +#[derive(Debug, Clone)] +pub enum VirtioInterruptType { + /// Interrupt for VirtIO configuration changes + Config, + /// Interrupts for new events in a queue. + Queue(u16), +} + +/// API of interrupt types used by VirtIO devices +pub trait VirtioInterrupt: std::fmt::Debug + Send + Sync { + /// Trigger a VirtIO interrupt. + fn trigger(&self, interrupt_type: VirtioInterruptType) -> Result<(), std::io::Error>; + + /// Trigger multiple Virtio interrupts for selected queues. + /// The caller needs to ensure that [`queues`] does not include duplicate entries to + /// avoid sending multiple interrupts for the same queue. + /// This is to allow sending a single interrupt for implementations that don't + /// distinguish different queues, like IrqTrigger, instead of sending multiple same + /// interrupts. + fn trigger_queues(&self, queues: &[u16]) -> Result<(), std::io::Error> { + queues + .iter() + .try_for_each(|&qidx| self.trigger(VirtioInterruptType::Queue(qidx))) + } + + /// Get the `EventFd` (if any) that backs the underlying interrupt. + fn notifier(&self, _interrupt_type: VirtioInterruptType) -> Option<&EventFd> { + None + } + + /// Get the current device interrupt status. + fn status(&self) -> Arc; + + /// Returns true if there is any pending interrupt + #[cfg(test)] + fn has_pending_interrupt(&self, interrupt_type: VirtioInterruptType) -> bool; +} diff --git a/src/vmm/src/devices/virtio/transport/pci/common_config.rs b/src/vmm/src/devices/virtio/transport/pci/common_config.rs new file mode 100644 index 00000000000..d353b04c43e --- /dev/null +++ b/src/vmm/src/devices/virtio/transport/pci/common_config.rs @@ -0,0 +1,471 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. 
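// Sketch of the trigger_queues() contract documented above: the default trait method
// fires one interrupt per listed queue, while the MMIO IrqTrigger implementation
// collapses any non-empty slice into a single VRING interrupt, which is why callers
// must not pass duplicate queue indices.
fn notify_rx_and_tx(interrupt: &dyn VirtioInterrupt) -> Result<(), std::io::Error> {
    // Hypothetical two-queue device: queue 0 (RX) and queue 1 (TX) both have used
    // buffers, so signal them together.
    interrupt.trigger_queues(&[0, 1])
}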
+// +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::sync::atomic::{AtomicU16, Ordering}; +use std::sync::{Arc, Mutex}; + +use byteorder::{ByteOrder, LittleEndian}; +use serde::{Deserialize, Serialize}; +use vm_memory::GuestAddress; + +use crate::devices::virtio::device::VirtioDevice; +use crate::devices::virtio::queue::Queue; +use crate::devices::virtio::transport::pci::device::VIRTQ_MSI_NO_VECTOR; +use crate::logger::{debug, error, info, trace, warn}; +pub const VIRTIO_PCI_COMMON_CONFIG_ID: &str = "virtio_pci_common_config"; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VirtioPciCommonConfigState { + pub driver_status: u8, + pub config_generation: u8, + pub device_feature_select: u32, + pub driver_feature_select: u32, + pub queue_select: u16, + pub msix_config: u16, + pub msix_queues: Vec, +} + +// The standard layout for the ring is a continuous chunk of memory which looks +// like this. We assume num is a power of 2. +// +// struct vring +// { +// // The actual descriptors (16 bytes each) +// struct vring_desc desc[num]; +// +// // A ring of available descriptor heads with free-running index. +// __virtio16 avail_flags; +// __virtio16 avail_idx; +// __virtio16 available[num]; +// __virtio16 used_event_idx; +// +// // Padding to the next align boundary. +// char pad[]; +// +// // A ring of used descriptor heads with free-running index. +// __virtio16 used_flags; +// __virtio16 used_idx; +// struct vring_used_elem used[num]; +// __virtio16 avail_event_idx; +// }; +// struct vring_desc { +// __virtio64 addr; +// __virtio32 len; +// __virtio16 flags; +// __virtio16 next; +// }; +// +// struct vring_avail { +// __virtio16 flags; +// __virtio16 idx; +// __virtio16 ring[]; +// }; +// +// // u32 is used here for ids for padding reasons. +// struct vring_used_elem { +// // Index of start of used descriptor chain. +// __virtio32 id; +// // Total length of the descriptor chain which was used (written to) +// __virtio32 len; +// }; +// +// Kernel header used for this reference: include/uapi/linux/virtio_ring.h +// Virtio Spec: https://docs.oasis-open.org/virtio/virtio/v1.2/csd01/virtio-v1.2-csd01.html +// +const VRING_DESC_ELEMENT_SIZE: usize = 16; +const VRING_AVAIL_ELEMENT_SIZE: usize = 2; +const VRING_USED_ELEMENT_SIZE: usize = 8; +#[derive(Debug)] +pub enum VringType { + Desc, + Avail, + Used, +} + +pub fn get_vring_size(t: VringType, queue_size: u16) -> u64 { + let (length_except_ring, element_size) = match t { + VringType::Desc => (0, VRING_DESC_ELEMENT_SIZE), + VringType::Avail => (6, VRING_AVAIL_ELEMENT_SIZE), + VringType::Used => (6, VRING_USED_ELEMENT_SIZE), + }; + (length_except_ring + element_size * queue_size as usize) as u64 +} + +/// Contains the data for reading and writing the common configuration structure of a virtio PCI +/// device. +/// +/// * Registers: +/// +/// ** About the whole device. +/// le32 device_feature_select; // 0x00 // read-write +/// le32 device_feature; // 0x04 // read-only for driver +/// le32 driver_feature_select; // 0x08 // read-write +/// le32 driver_feature; // 0x0C // read-write +/// le16 msix_config; // 0x10 // read-write +/// le16 num_queues; // 0x12 // read-only for driver +/// u8 device_status; // 0x14 // read-write (driver_status) +/// u8 config_generation; // 0x15 // read-only for driver +/// +/// ** About a specific virtqueue. +/// le16 queue_select; // 0x16 // read-write +/// le16 queue_size; // 0x18 // read-write, power of 2, or 0. 
+/// le16 queue_msix_vector; // 0x1A // read-write +/// le16 queue_enable; // 0x1C // read-write (Ready) +/// le16 queue_notify_off; // 0x1E // read-only for driver +/// le64 queue_desc; // 0x20 // read-write +/// le64 queue_avail; // 0x28 // read-write +/// le64 queue_used; // 0x30 // read-write +#[derive(Debug)] +pub struct VirtioPciCommonConfig { + pub driver_status: u8, + pub config_generation: u8, + pub device_feature_select: u32, + pub driver_feature_select: u32, + pub queue_select: u16, + pub msix_config: Arc, + pub msix_queues: Arc>>, +} + +impl VirtioPciCommonConfig { + pub fn new(state: VirtioPciCommonConfigState) -> Self { + VirtioPciCommonConfig { + driver_status: state.driver_status, + config_generation: state.config_generation, + device_feature_select: state.device_feature_select, + driver_feature_select: state.driver_feature_select, + queue_select: state.queue_select, + msix_config: Arc::new(AtomicU16::new(state.msix_config)), + msix_queues: Arc::new(Mutex::new(state.msix_queues)), + } + } + + pub fn state(&self) -> VirtioPciCommonConfigState { + VirtioPciCommonConfigState { + driver_status: self.driver_status, + config_generation: self.config_generation, + device_feature_select: self.device_feature_select, + driver_feature_select: self.driver_feature_select, + queue_select: self.queue_select, + msix_config: self.msix_config.load(Ordering::Acquire), + msix_queues: self.msix_queues.lock().unwrap().clone(), + } + } + + pub fn read(&mut self, offset: u64, data: &mut [u8], device: Arc>) { + assert!(data.len() <= 8); + + match data.len() { + 1 => { + let v = self.read_common_config_byte(offset); + data[0] = v; + } + 2 => { + let v = self.read_common_config_word(offset, device.lock().unwrap().queues()); + LittleEndian::write_u16(data, v); + } + 4 => { + let v = self.read_common_config_dword(offset, device); + LittleEndian::write_u32(data, v); + } + 8 => { + let v = self.read_common_config_qword(offset); + LittleEndian::write_u64(data, v); + } + _ => error!("invalid data length for virtio read: len {}", data.len()), + } + } + + pub fn write(&mut self, offset: u64, data: &[u8], device: Arc>) { + assert!(data.len() <= 8); + + match data.len() { + 1 => self.write_common_config_byte(offset, data[0]), + 2 => self.write_common_config_word( + offset, + LittleEndian::read_u16(data), + device.lock().unwrap().queues_mut(), + ), + 4 => self.write_common_config_dword(offset, LittleEndian::read_u32(data), device), + 8 => self.write_common_config_qword( + offset, + LittleEndian::read_u64(data), + device.lock().unwrap().queues_mut(), + ), + _ => error!("invalid data length for virtio write: len {}", data.len()), + } + } + + fn read_common_config_byte(&self, offset: u64) -> u8 { + debug!("read_common_config_byte: offset 0x{:x}", offset); + // The driver is only allowed to do aligned, properly sized access. 
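// Worked example for get_vring_size() defined earlier, using the element sizes from the
// vring layout comment, for a 256-entry queue:
//   descriptor table: 16 * 256     = 4096 bytes
//   available ring:    6 + 2 * 256 =  518 bytes (flags, idx, used_event + ring)
//   used ring:         6 + 8 * 256 = 2054 bytes (flags, idx, avail_event + ring)
fn vring_sizes_for_a_256_entry_queue() {
    assert_eq!(get_vring_size(VringType::Desc, 256), 4096);
    assert_eq!(get_vring_size(VringType::Avail, 256), 518);
    assert_eq!(get_vring_size(VringType::Used, 256), 2054);
}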
+ match offset { + 0x14 => self.driver_status, + 0x15 => self.config_generation, + _ => { + warn!("invalid virtio config byte read: 0x{:x}", offset); + 0 + } + } + } + + fn write_common_config_byte(&mut self, offset: u64, value: u8) { + debug!("write_common_config_byte: offset 0x{offset:x}: {value:x}"); + match offset { + 0x14 => self.driver_status = value, + _ => { + warn!("invalid virtio config byte write: 0x{:x}", offset); + } + } + } + + fn read_common_config_word(&self, offset: u64, queues: &[Queue]) -> u16 { + debug!("read_common_config_word: offset 0x{:x}", offset); + match offset { + 0x10 => self.msix_config.load(Ordering::Acquire), + 0x12 => queues.len().try_into().unwrap(), // num_queues + 0x16 => self.queue_select, + 0x18 => self.with_queue(queues, |q| q.size).unwrap_or(0), + // If `queue_select` points to an invalid queue we should return NO_VECTOR. + // Reading from here + // https://docs.oasis-open.org/virtio/virtio/v1.1/csprd01/virtio-v1.1-csprd01.html#x1-1280005: + // + // > The device MUST return vector mapped to a given event, (NO_VECTOR if unmapped) on + // > read of config_msix_vector/queue_msix_vector. + 0x1a => self + .msix_queues + .lock() + .unwrap() + .get(self.queue_select as usize) + .copied() + .unwrap_or(VIRTQ_MSI_NO_VECTOR), + 0x1c => u16::from(self.with_queue(queues, |q| q.ready).unwrap_or(false)), + 0x1e => self.queue_select, // notify_off + _ => { + warn!("invalid virtio register word read: 0x{:x}", offset); + 0 + } + } + } + + fn write_common_config_word(&mut self, offset: u64, value: u16, queues: &mut [Queue]) { + debug!("write_common_config_word: offset 0x{:x}", offset); + match offset { + 0x10 => { + // Make sure that the guest doesn't select an invalid vector. We are offering + // `num_queues + 1` vectors (plus one for configuration updates). If an invalid + // vector has been selected, we just store the `NO_VECTOR` value. + let mut msix_queues = self.msix_queues.lock().expect("Poisoned lock"); + let nr_vectors = msix_queues.len() + 1; + + if (value as usize) < nr_vectors { + self.msix_config.store(value, Ordering::Release); + } else { + self.msix_config + .store(VIRTQ_MSI_NO_VECTOR, Ordering::Release); + } + } + 0x16 => self.queue_select = value, + 0x18 => self.with_queue_mut(queues, |q| q.size = value), + 0x1a => { + let mut msix_queues = self.msix_queues.lock().expect("Poisoned lock"); + let nr_vectors = msix_queues.len() + 1; + // Make sure that `queue_select` points to a valid queue. If not, we won't do + // anything here and subsequent reads at 0x1a will return `NO_VECTOR`. + if let Some(queue) = msix_queues.get_mut(self.queue_select as usize) { + // Make sure that the guest doesn't select an invalid vector. We are offering + // `num_queues + 1` vectors (plus one for configuration updates). If an invalid + // vector has been selected, we just store the `NO_VECTOR` value. + if (value as usize) < nr_vectors { + *queue = value; + } else { + *queue = VIRTQ_MSI_NO_VECTOR; + } + } + } + 0x1c => self.with_queue_mut(queues, |q| { + q.ready = value == 1; + }), + _ => { + warn!("invalid virtio register word write: 0x{:x}", offset); + } + } + } + + fn read_common_config_dword(&self, offset: u64, device: Arc>) -> u32 { + debug!("read_common_config_dword: offset 0x{:x}", offset); + match offset { + 0x00 => self.device_feature_select, + 0x04 => { + let locked_device = device.lock().unwrap(); + // Only 64 bits of features (2 pages) are defined for now, so limit + // device_feature_select to avoid shifting by 64 or more bits. 
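// Worked example of the page-select behaviour explained above, assuming a device whose
// only offered feature is VIRTIO_F_VERSION_1 (bit 32):
//   device_feature_select == 0  ->  (avail_features >> 0)  as u32 == 0x0000_0000
//   device_feature_select == 1  ->  (avail_features >> 32) as u32 == 0x0000_0001
//   device_feature_select >= 2  ->  0, since only two 32-bit feature pages are defined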
+ if self.device_feature_select < 2 { + ((locked_device.avail_features() >> (self.device_feature_select * 32)) + & 0xffff_ffff) as u32 + } else { + 0 + } + } + 0x08 => self.driver_feature_select, + _ => { + warn!("invalid virtio register dword read: 0x{:x}", offset); + 0 + } + } + } + + fn write_common_config_dword( + &mut self, + offset: u64, + value: u32, + device: Arc>, + ) { + debug!("write_common_config_dword: offset 0x{:x}", offset); + fn hi(v: &mut GuestAddress, x: u32) { + *v = (*v & 0xffff_ffff) | (u64::from(x) << 32) + } + + fn lo(v: &mut GuestAddress, x: u32) { + *v = (*v & !0xffff_ffff) | u64::from(x) + } + + let mut locked_device = device.lock().unwrap(); + + match offset { + 0x00 => self.device_feature_select = value, + 0x08 => self.driver_feature_select = value, + 0x0c => locked_device.ack_features_by_page(self.driver_feature_select, value), + 0x20 => self.with_queue_mut(locked_device.queues_mut(), |q| { + lo(&mut q.desc_table_address, value) + }), + 0x24 => self.with_queue_mut(locked_device.queues_mut(), |q| { + hi(&mut q.desc_table_address, value) + }), + 0x28 => self.with_queue_mut(locked_device.queues_mut(), |q| { + lo(&mut q.avail_ring_address, value) + }), + 0x2c => self.with_queue_mut(locked_device.queues_mut(), |q| { + hi(&mut q.avail_ring_address, value) + }), + 0x30 => self.with_queue_mut(locked_device.queues_mut(), |q| { + lo(&mut q.used_ring_address, value) + }), + 0x34 => self.with_queue_mut(locked_device.queues_mut(), |q| { + hi(&mut q.used_ring_address, value) + }), + _ => { + warn!("invalid virtio register dword write: 0x{:x}", offset); + } + } + } + + fn read_common_config_qword(&self, _offset: u64) -> u64 { + debug!("read_common_config_qword: offset 0x{:x}", _offset); + 0 // Assume the guest has no reason to read write-only registers. + } + + fn write_common_config_qword(&mut self, offset: u64, value: u64, queues: &mut [Queue]) { + debug!("write_common_config_qword: offset 0x{:x}", offset); + + let low = Some((value & 0xffff_ffff) as u32); + let high = Some((value >> 32) as u32); + + match offset { + 0x20 => self.with_queue_mut(queues, |q| q.desc_table_address.0 = value), + 0x28 => self.with_queue_mut(queues, |q| q.avail_ring_address.0 = value), + 0x30 => self.with_queue_mut(queues, |q| q.used_ring_address.0 = value), + _ => { + warn!("invalid virtio register qword write: 0x{:x}", offset); + } + } + } + + fn with_queue(&self, queues: &[Queue], f: F) -> Option + where + F: FnOnce(&Queue) -> U, + { + queues.get(self.queue_select as usize).map(f) + } + + fn with_queue_mut(&self, queues: &mut [Queue], f: F) { + if let Some(queue) = queues.get_mut(self.queue_select as usize) { + f(queue); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::devices::virtio::transport::mmio::tests::DummyDevice; + + #[test] + fn write_base_regs() { + let mut regs = VirtioPciCommonConfig { + driver_status: 0xaa, + config_generation: 0x55, + device_feature_select: 0x0, + driver_feature_select: 0x0, + queue_select: 0xff, + msix_config: Arc::new(AtomicU16::new(0)), + msix_queues: Arc::new(Mutex::new(vec![0; 3])), + }; + + let dev = Arc::new(Mutex::new(DummyDevice::new())); + // Can set all bits of driver_status. + regs.write(0x14, &[0x55], dev.clone()); + let mut read_back = vec![0x00]; + regs.read(0x14, &mut read_back, dev.clone()); + assert_eq!(read_back[0], 0x55); + + // The config generation register is read only. 
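// Sketch of the two equivalent ways a driver can program the 64-bit queue_desc address,
// both handled above: a single 8-byte write at 0x20 (write_common_config_qword) or two
// 4-byte writes at 0x20/0x24 through the lo()/hi() helpers. Assumes queue_select already
// points at a valid queue; `dev` follows the same pattern as the tests below.
fn program_queue_desc(regs: &mut VirtioPciCommonConfig, dev: Arc<Mutex<DummyDevice>>, addr: u64) {
    // One qword write...
    regs.write(0x20, &addr.to_le_bytes(), dev.clone());
    // ...or a low dword at 0x20 followed by a high dword at 0x24.
    regs.write(0x20, &((addr & 0xffff_ffff) as u32).to_le_bytes(), dev.clone());
    regs.write(0x24, &((addr >> 32) as u32).to_le_bytes(), dev);
}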
+ regs.write(0x15, &[0xaa], dev.clone()); + let mut read_back = vec![0x00]; + regs.read(0x15, &mut read_back, dev.clone()); + assert_eq!(read_back[0], 0x55); + + // Device features is read-only and passed through from the device. + regs.write(0x04, &[0, 0, 0, 0], dev.clone()); + let mut read_back = vec![0, 0, 0, 0]; + regs.read(0x04, &mut read_back, dev.clone()); + assert_eq!(LittleEndian::read_u32(&read_back), 0u32); + + // Feature select registers are read/write. + regs.write(0x00, &[1, 2, 3, 4], dev.clone()); + let mut read_back = vec![0, 0, 0, 0]; + regs.read(0x00, &mut read_back, dev.clone()); + assert_eq!(LittleEndian::read_u32(&read_back), 0x0403_0201); + regs.write(0x08, &[1, 2, 3, 4], dev.clone()); + let mut read_back = vec![0, 0, 0, 0]; + regs.read(0x08, &mut read_back, dev.clone()); + assert_eq!(LittleEndian::read_u32(&read_back), 0x0403_0201); + + // 'queue_select' can be read and written. + regs.write(0x16, &[0xaa, 0x55], dev.clone()); + let mut read_back = vec![0x00, 0x00]; + regs.read(0x16, &mut read_back, dev.clone()); + assert_eq!(read_back[0], 0xaa); + assert_eq!(read_back[1], 0x55); + + // Getting the MSI vector when `queue_select` points to an invalid queue should return + // NO_VECTOR (0xffff) + regs.read(0x1a, &mut read_back, dev.clone()); + assert_eq!(read_back, [0xff, 0xff]); + + // Writing the MSI vector of an invalid `queue_select` does not have any effect. + regs.write(0x1a, &[0x12, 0x13], dev.clone()); + assert_eq!(read_back, [0xff, 0xff]); + // Valid `queue_select` though should setup the corresponding MSI-X queue. + regs.write(0x16, &[0x1, 0x0], dev.clone()); + assert_eq!(regs.queue_select, 1); + regs.write(0x1a, &[0x1, 0x0], dev.clone()); + regs.read(0x1a, &mut read_back, dev); + assert_eq!(LittleEndian::read_u16(&read_back[..2]), 0x1); + } +} diff --git a/src/vmm/src/devices/virtio/transport/pci/device.rs b/src/vmm/src/devices/virtio/transport/pci/device.rs new file mode 100644 index 00000000000..038264bb417 --- /dev/null +++ b/src/vmm/src/devices/virtio/transport/pci/device.rs @@ -0,0 +1,1286 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. 
+// +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::cmp; +use std::collections::HashMap; +use std::fmt::{Debug, Formatter}; +use std::io::{ErrorKind, Write}; +use std::sync::atomic::{AtomicBool, AtomicU16, AtomicU32, AtomicUsize, Ordering}; +use std::sync::{Arc, Barrier, Mutex}; + +use anyhow::anyhow; +use kvm_ioctls::{IoEventAddress, NoDatamatch}; +use pci::{ + BarReprogrammingParams, MsixCap, MsixConfig, MsixConfigState, PciBarConfiguration, + PciBarRegionType, PciBdf, PciCapability, PciCapabilityId, PciClassCode, PciConfiguration, + PciConfigurationState, PciDevice, PciDeviceError, PciHeaderType, PciMassStorageSubclass, + PciNetworkControllerSubclass, PciSubclass, +}; +use serde::{Deserialize, Serialize}; +use thiserror::Error; +use vm_allocator::{AddressAllocator, AllocPolicy, RangeInclusive}; +use vm_device::interrupt::{InterruptIndex, InterruptSourceGroup, MsiIrqGroupConfig}; +use vm_device::{BusDevice, PciBarType}; +use vm_memory::{Address, ByteValued, GuestAddress, Le32}; +use vmm_sys_util::errno; +use vmm_sys_util::eventfd::EventFd; + +use crate::Vm; +use crate::devices::virtio::device::VirtioDevice; +use crate::devices::virtio::queue::Queue; +use crate::devices::virtio::transport::pci::common_config::{ + VirtioPciCommonConfig, VirtioPciCommonConfigState, +}; +use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; +use crate::devices::virtio::{TYPE_BLOCK, TYPE_NET}; +use crate::logger::{debug, error}; +use crate::snapshot::Persist; +use crate::utils::u64_to_usize; +use crate::vstate::memory::GuestMemoryMmap; +use crate::vstate::resources::ResourceAllocator; +use crate::vstate::vm::{InterruptError, MsiVectorGroup}; + +const DEVICE_INIT: u8 = 0x00; +const DEVICE_ACKNOWLEDGE: u8 = 0x01; +const DEVICE_DRIVER: u8 = 0x02; +const DEVICE_DRIVER_OK: u8 = 0x04; +const DEVICE_FEATURES_OK: u8 = 0x08; +const DEVICE_FAILED: u8 = 0x80; + +const VIRTIO_F_RING_INDIRECT_DESC: u32 = 28; +const VIRTIO_F_RING_EVENT_IDX: u32 = 29; +const VIRTIO_F_VERSION_1: u32 = 32; +const VIRTIO_F_IOMMU_PLATFORM: u32 = 33; +const VIRTIO_F_IN_ORDER: u32 = 35; +const VIRTIO_F_ORDER_PLATFORM: u32 = 36; +#[allow(dead_code)] +const VIRTIO_F_SR_IOV: u32 = 37; +const VIRTIO_F_NOTIFICATION_DATA: u32 = 38; + +/// Vector value used to disable MSI for a queue. +pub const VIRTQ_MSI_NO_VECTOR: u16 = 0xffff; + +/// BAR index we are using for VirtIO configuration +const VIRTIO_BAR_INDEX: u8 = 0; + +enum PciCapabilityType { + Common = 1, + Notify = 2, + Isr = 3, + Device = 4, + Pci = 5, + SharedMemory = 8, +} + +// This offset represents the 2 bytes omitted from the VirtioPciCap structure +// as they are already handled through add_capability(). These 2 bytes are the +// fields cap_vndr (1 byte) and cap_next (1 byte) defined in the virtio spec. +const VIRTIO_PCI_CAP_OFFSET: usize = 2; + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Debug, Clone, Copy, Default)] +struct VirtioPciCap { + cap_len: u8, // Generic PCI field: capability length + cfg_type: u8, // Identifies the structure. + pci_bar: u8, // Where to find it. + id: u8, // Multiple capabilities of the same type + padding: [u8; 2], // Pad to full dword. + offset: Le32, // Offset within bar. + length: Le32, // Length of the structure, in bytes. +} + +// SAFETY: All members are simple numbers and any value is valid. 
+unsafe impl ByteValued for VirtioPciCap {} + +impl PciCapability for VirtioPciCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +const VIRTIO_PCI_CAP_LEN_OFFSET: u8 = 2; + +impl VirtioPciCap { + pub fn new(cfg_type: PciCapabilityType, offset: u32, length: u32) -> Self { + VirtioPciCap { + cap_len: u8::try_from(std::mem::size_of::()).unwrap() + + VIRTIO_PCI_CAP_LEN_OFFSET, + cfg_type: cfg_type as u8, + pci_bar: VIRTIO_BAR_INDEX, + id: 0, + padding: [0; 2], + offset: Le32::from(offset), + length: Le32::from(length), + } + } +} + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Clone, Copy, Default)] +struct VirtioPciNotifyCap { + cap: VirtioPciCap, + notify_off_multiplier: Le32, +} +// SAFETY: All members are simple numbers and any value is valid. +unsafe impl ByteValued for VirtioPciNotifyCap {} + +impl PciCapability for VirtioPciNotifyCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +impl VirtioPciNotifyCap { + pub fn new(cfg_type: PciCapabilityType, offset: u32, length: u32, multiplier: Le32) -> Self { + VirtioPciNotifyCap { + cap: VirtioPciCap { + cap_len: u8::try_from(std::mem::size_of::()).unwrap() + + VIRTIO_PCI_CAP_LEN_OFFSET, + cfg_type: cfg_type as u8, + pci_bar: VIRTIO_BAR_INDEX, + id: 0, + padding: [0; 2], + offset: Le32::from(offset), + length: Le32::from(length), + }, + notify_off_multiplier: multiplier, + } + } +} + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Clone, Copy, Default)] +struct VirtioPciCap64 { + cap: VirtioPciCap, + offset_hi: Le32, + length_hi: Le32, +} +// SAFETY: All members are simple numbers and any value is valid. +unsafe impl ByteValued for VirtioPciCap64 {} + +impl PciCapability for VirtioPciCap64 { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +impl VirtioPciCap64 { + pub fn new(cfg_type: PciCapabilityType, pci_bar: u8, id: u8, offset: u64, length: u64) -> Self { + VirtioPciCap64 { + cap: VirtioPciCap { + cap_len: u8::try_from(std::mem::size_of::()).unwrap() + + VIRTIO_PCI_CAP_LEN_OFFSET, + cfg_type: cfg_type as u8, + pci_bar, + id, + padding: [0; 2], + offset: Le32::from((offset & 0xffff_ffff) as u32), + length: Le32::from((length & 0xffff_ffff) as u32), + }, + offset_hi: Le32::from((offset >> 32) as u32), + length_hi: Le32::from((length >> 32) as u32), + } + } +} + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Debug, Clone, Copy, Default)] +struct VirtioPciCfgCap { + cap: VirtioPciCap, + pci_cfg_data: [u8; 4], +} +// SAFETY: All members are simple numbers and any value is valid. +unsafe impl ByteValued for VirtioPciCfgCap {} + +impl PciCapability for VirtioPciCfgCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +impl VirtioPciCfgCap { + fn new() -> Self { + VirtioPciCfgCap { + cap: VirtioPciCap::new(PciCapabilityType::Pci, 0, 0), + ..Default::default() + } + } +} + +#[derive(Debug, Clone, Copy, Default)] +struct VirtioPciCfgCapInfo { + offset: usize, + cap: VirtioPciCfgCap, +} + +#[allow(dead_code)] +#[derive(Debug, Copy, Clone)] +pub enum PciVirtioSubclass { + NonTransitionalBase = 0xff, +} + +impl PciSubclass for PciVirtioSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +// Allocate one bar for the structs pointed to by the capability structures. 
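// Quick arithmetic behind the cap_len values used by the constructors above (a sanity
// check, not part of the patch): the packed VirtioPciCap struct is 14 bytes
// (1 + 1 + 1 + 1 + 2 + 4 + 4), and VIRTIO_PCI_CAP_LEN_OFFSET adds back the 2 bytes for
// cap_vndr/cap_next that add_capability() fills in, so the reported length is 16.
const _: () = assert!(std::mem::size_of::<VirtioPciCap>() + VIRTIO_PCI_CAP_LEN_OFFSET as usize == 16);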
+// As per the PCI specification, because the same BAR shares MSI-X and non +// MSI-X structures, it is recommended to use 8KiB alignment for all those +// structures. +const COMMON_CONFIG_BAR_OFFSET: u64 = 0x0000; +const COMMON_CONFIG_SIZE: u64 = 56; +const ISR_CONFIG_BAR_OFFSET: u64 = 0x2000; +const ISR_CONFIG_SIZE: u64 = 1; +const DEVICE_CONFIG_BAR_OFFSET: u64 = 0x4000; +const DEVICE_CONFIG_SIZE: u64 = 0x1000; +const NOTIFICATION_BAR_OFFSET: u64 = 0x6000; +const NOTIFICATION_SIZE: u64 = 0x1000; +const MSIX_TABLE_BAR_OFFSET: u64 = 0x8000; +// The size is 256KiB because the table can hold up to 2048 entries, with each +// entry being 128 bits (4 DWORDS). +const MSIX_TABLE_SIZE: u64 = 0x40000; +const MSIX_PBA_BAR_OFFSET: u64 = 0x48000; +// The size is 2KiB because the Pending Bit Array has one bit per vector and it +// can support up to 2048 vectors. +const MSIX_PBA_SIZE: u64 = 0x800; +// The BAR size must be a power of 2. +const CAPABILITY_BAR_SIZE: u64 = 0x80000; +const VIRTIO_COMMON_BAR_INDEX: usize = 0; +const VIRTIO_SHM_BAR_INDEX: usize = 2; + +const NOTIFY_OFF_MULTIPLIER: u32 = 4; // A dword per notification address. + +const VIRTIO_PCI_VENDOR_ID: u16 = 0x1af4; +const VIRTIO_PCI_DEVICE_ID_BASE: u16 = 0x1040; // Add to device type to get device ID. + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct QueueState { + max_size: u16, + size: u16, + ready: bool, + desc_table: u64, + avail_ring: u64, + used_ring: u64, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VirtioPciDeviceState { + pub pci_device_bdf: PciBdf, + pub device_activated: bool, + pub interrupt_status: usize, + pub cap_pci_cfg_offset: usize, + pub cap_pci_cfg: Vec, + pub pci_configuration_state: PciConfigurationState, + pub pci_dev_state: VirtioPciCommonConfigState, + pub msix_state: MsixConfigState, + pub msi_vector_group: Vec, + pub bar_configuration: PciBarConfiguration, +} + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum VirtioPciDeviceError { + /// Failed creating VirtioPciDevice: {0} + CreateVirtioPciDevice(#[from] anyhow::Error), + /// Error creating MSI configuration: {0} + Msi(#[from] pci::MsixError), +} +pub type Result = std::result::Result; + +pub struct VirtioPciDevice { + id: String, + + // BDF assigned to the device + pci_device_bdf: PciBdf, + + // PCI configuration registers. + configuration: PciConfiguration, + + // virtio PCI common configuration + common_config: VirtioPciCommonConfig, + + // MSI-X config + msix_config: Option>>, + + // Number of MSI-X vectors + msix_num: u16, + + // Virtio device reference and status + device: Arc>, + device_activated: Arc, + + // PCI interrupts. + interrupt_status: Arc, + virtio_interrupt: Option>, + interrupt_source_group: Arc, + + // Guest memory + memory: GuestMemoryMmap, + + // Add a dedicated structure to hold information about the very specific + // virtio-pci capability VIRTIO_PCI_CAP_PCI_CFG. This is needed to support + // the legacy/backward compatible mechanism of letting the guest access the + // other virtio capabilities without mapping the PCI BARs. This can be + // needed when the guest tries to early access the virtio configuration of + // a device. 
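// Quick layout arithmetic for the capability BAR above (a sanity check, not part of the
// patch): every structure starts on its own 8 KiB-aligned offset, the MSI-X table ends
// exactly where the PBA begins, and everything fits inside the power-of-two BAR.
const _: () = assert!(MSIX_TABLE_BAR_OFFSET + MSIX_TABLE_SIZE == MSIX_PBA_BAR_OFFSET); // 0x8000 + 0x40000 == 0x48000
const _: () = assert!(MSIX_PBA_BAR_OFFSET + MSIX_PBA_SIZE <= CAPABILITY_BAR_SIZE); // 0x48800 <= 0x80000
const _: () = assert!(CAPABILITY_BAR_SIZE.is_power_of_two());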
+ cap_pci_cfg_info: VirtioPciCfgCapInfo, + + // Details of BAR region + pub bar_region: PciBarConfiguration, +} + +impl Debug for VirtioPciDevice { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + f.debug_struct("VirtioPciDevice") + .field("id", &self.id) + .finish() + } +} + +impl VirtioPciDevice { + fn pci_configuration( + virtio_device_type: u32, + msix_config: &Arc>, + pci_config_state: Option, + ) -> PciConfiguration { + let pci_device_id = VIRTIO_PCI_DEVICE_ID_BASE + u16::try_from(virtio_device_type).unwrap(); + let (class, subclass) = match virtio_device_type { + TYPE_NET => ( + PciClassCode::NetworkController, + &PciNetworkControllerSubclass::EthernetController as &dyn PciSubclass, + ), + TYPE_BLOCK => ( + PciClassCode::MassStorage, + &PciMassStorageSubclass::MassStorage as &dyn PciSubclass, + ), + _ => ( + PciClassCode::Other, + &PciVirtioSubclass::NonTransitionalBase as &dyn PciSubclass, + ), + }; + + PciConfiguration::new( + VIRTIO_PCI_VENDOR_ID, + pci_device_id, + 0x1, // For modern virtio-PCI devices + class, + subclass, + None, + PciHeaderType::Device, + VIRTIO_PCI_VENDOR_ID, + pci_device_id, + Some(msix_config.clone()), + pci_config_state, + ) + } + + fn msix_config( + pci_device_bdf: u32, + msix_vectors: Arc, + msix_config_state: Option, + ) -> Result>> { + let msix_config = Arc::new(Mutex::new(MsixConfig::new( + msix_vectors.num_vectors(), + msix_vectors, + pci_device_bdf, + msix_config_state, + )?)); + + Ok(msix_config) + } + + /// Constructs a new PCI transport for the given virtio device. + pub fn new( + id: String, + memory: GuestMemoryMmap, + device: Arc>, + msi_vectors: Arc, + pci_device_bdf: u32, + ) -> Result { + let num_queues = device.lock().expect("Poisoned lock").queues().len(); + + let msix_config = Self::msix_config(pci_device_bdf, msi_vectors.clone(), None)?; + let pci_config = Self::pci_configuration( + device.lock().expect("Poisoned lock").device_type(), + &msix_config, + None, + ); + + let virtio_common_config = VirtioPciCommonConfig::new(VirtioPciCommonConfigState { + driver_status: 0, + config_generation: 0, + device_feature_select: 0, + driver_feature_select: 0, + queue_select: 0, + msix_config: VIRTQ_MSI_NO_VECTOR, + msix_queues: vec![VIRTQ_MSI_NO_VECTOR; num_queues], + }); + let interrupt = Arc::new(VirtioInterruptMsix::new( + msix_config.clone(), + virtio_common_config.msix_config.clone(), + virtio_common_config.msix_queues.clone(), + msi_vectors.clone(), + )); + + let virtio_pci_device = VirtioPciDevice { + id, + pci_device_bdf: pci_device_bdf.into(), + configuration: pci_config, + common_config: virtio_common_config, + msix_config: Some(msix_config), + msix_num: msi_vectors.num_vectors(), + device, + device_activated: Arc::new(AtomicBool::new(false)), + interrupt_status: Arc::new(AtomicUsize::new(0)), + virtio_interrupt: Some(interrupt), + memory, + interrupt_source_group: msi_vectors, + cap_pci_cfg_info: VirtioPciCfgCapInfo::default(), + bar_region: PciBarConfiguration::default(), + }; + + Ok(virtio_pci_device) + } + + pub fn new_from_state( + id: String, + memory: GuestMemoryMmap, + device: Arc>, + msi_vectors: Arc, + state: VirtioPciDeviceState, + ) -> Result { + let msix_config = Self::msix_config( + state.pci_device_bdf.into(), + msi_vectors.clone(), + Some(state.msix_state), + )?; + + let pci_config = Self::pci_configuration( + device.lock().expect("Poisoned lock").device_type(), + &msix_config, + Some(state.pci_configuration_state), + ); + let virtio_common_config = VirtioPciCommonConfig::new(state.pci_dev_state); + let 
cap_pci_cfg_info = VirtioPciCfgCapInfo { + offset: state.cap_pci_cfg_offset, + cap: *VirtioPciCfgCap::from_slice(&state.cap_pci_cfg).unwrap(), + }; + + let interrupt = Arc::new(VirtioInterruptMsix::new( + msix_config.clone(), + virtio_common_config.msix_config.clone(), + virtio_common_config.msix_queues.clone(), + msi_vectors.clone(), + )); + + let virtio_pci_device = VirtioPciDevice { + id, + pci_device_bdf: state.pci_device_bdf, + configuration: pci_config, + common_config: virtio_common_config, + msix_config: Some(msix_config), + msix_num: msi_vectors.num_vectors(), + device, + device_activated: Arc::new(AtomicBool::new(state.device_activated)), + interrupt_status: Arc::new(AtomicUsize::new(state.interrupt_status)), + virtio_interrupt: Some(interrupt), + memory: memory.clone(), + interrupt_source_group: msi_vectors, + cap_pci_cfg_info, + bar_region: state.bar_configuration, + }; + + if state.device_activated { + virtio_pci_device + .device + .lock() + .expect("Poisoned lock") + .activate( + memory, + virtio_pci_device.virtio_interrupt.as_ref().unwrap().clone(), + ); + } + + Ok(virtio_pci_device) + } + + fn is_driver_ready(&self) -> bool { + let ready_bits = + (DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_DRIVER_OK | DEVICE_FEATURES_OK); + self.common_config.driver_status == ready_bits + && self.common_config.driver_status & DEVICE_FAILED == 0 + } + + /// Determines if the driver has requested the device (re)init / reset itself + fn is_driver_init(&self) -> bool { + self.common_config.driver_status == DEVICE_INIT + } + + pub fn config_bar_addr(&self) -> u64 { + self.configuration.get_bar_addr(VIRTIO_BAR_INDEX as usize) + } + + fn add_pci_capabilities(&mut self) -> std::result::Result<(), PciDeviceError> { + // Add pointers to the different configuration structures from the PCI capabilities. + let common_cap = VirtioPciCap::new( + PciCapabilityType::Common, + COMMON_CONFIG_BAR_OFFSET.try_into().unwrap(), + COMMON_CONFIG_SIZE.try_into().unwrap(), + ); + self.configuration + .add_capability(&common_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + let isr_cap = VirtioPciCap::new( + PciCapabilityType::Isr, + ISR_CONFIG_BAR_OFFSET.try_into().unwrap(), + ISR_CONFIG_SIZE.try_into().unwrap(), + ); + self.configuration + .add_capability(&isr_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + // TODO(dgreid) - set based on device's configuration size? + let device_cap = VirtioPciCap::new( + PciCapabilityType::Device, + DEVICE_CONFIG_BAR_OFFSET.try_into().unwrap(), + DEVICE_CONFIG_SIZE.try_into().unwrap(), + ); + self.configuration + .add_capability(&device_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + let notify_cap = VirtioPciNotifyCap::new( + PciCapabilityType::Notify, + NOTIFICATION_BAR_OFFSET.try_into().unwrap(), + NOTIFICATION_SIZE.try_into().unwrap(), + Le32::from(NOTIFY_OFF_MULTIPLIER), + ); + self.configuration + .add_capability(¬ify_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + let configuration_cap = VirtioPciCfgCap::new(); + self.cap_pci_cfg_info.offset = self + .configuration + .add_capability(&configuration_cap) + .map_err(PciDeviceError::CapabilitiesSetup)? 
+ + VIRTIO_PCI_CAP_OFFSET; + self.cap_pci_cfg_info.cap = configuration_cap; + + if self.msix_config.is_some() { + let msix_cap = MsixCap::new( + VIRTIO_BAR_INDEX, + self.msix_num, + MSIX_TABLE_BAR_OFFSET.try_into().unwrap(), + VIRTIO_BAR_INDEX, + MSIX_PBA_BAR_OFFSET.try_into().unwrap(), + ); + self.configuration + .add_capability(&msix_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + } + + Ok(()) + } + + fn read_cap_pci_cfg(&mut self, offset: usize, mut data: &mut [u8]) { + let cap_slice = self.cap_pci_cfg_info.cap.as_slice(); + let data_len = data.len(); + let cap_len = cap_slice.len(); + if offset + data_len > cap_len { + error!("Failed to read cap_pci_cfg from config space"); + return; + } + + if offset < std::mem::size_of::() { + if let Some(end) = offset.checked_add(data_len) { + // This write can't fail, offset and end are checked against config_len. + data.write_all(&cap_slice[offset..cmp::min(end, cap_len)]) + .unwrap(); + } + } else { + let bar_offset: u32 = + // SAFETY: we know self.cap_pci_cfg_info.cap.cap.offset is 32bits long. + unsafe { std::mem::transmute(self.cap_pci_cfg_info.cap.cap.offset) }; + self.read_bar(0, bar_offset as u64, data) + } + } + + fn write_cap_pci_cfg(&mut self, offset: usize, data: &[u8]) -> Option> { + let cap_slice = self.cap_pci_cfg_info.cap.as_mut_slice(); + let data_len = data.len(); + let cap_len = cap_slice.len(); + if offset + data_len > cap_len { + error!("Failed to write cap_pci_cfg to config space"); + return None; + } + + if offset < std::mem::size_of::() { + let (_, right) = cap_slice.split_at_mut(offset); + right[..data_len].copy_from_slice(data); + None + } else { + let bar_offset: u32 = + // SAFETY: we know self.cap_pci_cfg_info.cap.cap.offset is 32bits long. + unsafe { std::mem::transmute(self.cap_pci_cfg_info.cap.cap.offset) }; + self.write_bar(0, bar_offset as u64, data) + } + } + + pub fn virtio_device(&self) -> Arc> { + self.device.clone() + } + + fn needs_activation(&self) -> bool { + !self.device_activated.load(Ordering::SeqCst) && self.is_driver_ready() + } + + /// Register the IoEvent notification for a VirtIO device + pub fn register_notification_ioevent(&self, vm: &Vm) -> std::result::Result<(), errno::Error> { + let bar_addr = self.config_bar_addr(); + for (i, queue_evt) in self + .device + .lock() + .expect("Poisoned lock") + .queue_events() + .iter() + .enumerate() + { + let notify_base = bar_addr + NOTIFICATION_BAR_OFFSET; + let io_addr = + IoEventAddress::Mmio(notify_base + i as u64 * NOTIFY_OFF_MULTIPLIER as u64); + vm.fd().register_ioevent(queue_evt, &io_addr, NoDatamatch)?; + } + Ok(()) + } + + /// Unregister the IoEvent notification for a VirtIO device + pub fn unregister_notification_ioevent( + &self, + vm: &Vm, + ) -> std::result::Result<(), errno::Error> { + let bar_addr = self.config_bar_addr(); + for (i, queue_evt) in self + .device + .lock() + .expect("Poisoned lock") + .queue_events() + .iter() + .enumerate() + { + let notify_base = bar_addr + NOTIFICATION_BAR_OFFSET; + let io_addr = + IoEventAddress::Mmio(notify_base + i as u64 * NOTIFY_OFF_MULTIPLIER as u64); + vm.fd() + .unregister_ioevent(queue_evt, &io_addr, NoDatamatch)?; + } + Ok(()) + } + + pub fn state(&self) -> VirtioPciDeviceState { + VirtioPciDeviceState { + pci_device_bdf: self.pci_device_bdf, + device_activated: self.device_activated.load(Ordering::Acquire), + interrupt_status: self.interrupt_status.load(Ordering::Acquire), + cap_pci_cfg_offset: self.cap_pci_cfg_info.offset, + cap_pci_cfg: self.cap_pci_cfg_info.cap.bytes().to_vec(), + 
pci_configuration_state: self.configuration.state(), + pci_dev_state: self.common_config.state(), + msix_state: self + .msix_config + .as_ref() + .unwrap() + .lock() + .expect("Poisoned lock") + .state(), + msi_vector_group: self.interrupt_source_group.save(), + bar_configuration: self.bar_region, + } + } +} + +pub struct VirtioInterruptMsix { + msix_config: Arc>, + config_vector: Arc, + queues_vectors: Arc>>, + interrupt_source_group: Arc, +} + +impl std::fmt::Debug for VirtioInterruptMsix { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("VirtioInterruptMsix") + .field("msix_config", &self.msix_config) + .field("config_vector", &self.config_vector) + .field("queues_vectors", &self.queues_vectors) + .finish() + } +} + +impl VirtioInterruptMsix { + pub fn new( + msix_config: Arc>, + config_vector: Arc, + queues_vectors: Arc>>, + interrupt_source_group: Arc, + ) -> Self { + VirtioInterruptMsix { + msix_config, + config_vector, + queues_vectors, + interrupt_source_group, + } + } +} + +impl VirtioInterrupt for VirtioInterruptMsix { + fn trigger(&self, int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { + let vector = match int_type { + VirtioInterruptType::Config => self.config_vector.load(Ordering::Acquire), + VirtioInterruptType::Queue(queue_index) => *self + .queues_vectors + .lock() + .unwrap() + .get(queue_index as usize) + .ok_or(ErrorKind::InvalidInput)?, + }; + + if vector == VIRTQ_MSI_NO_VECTOR { + return Ok(()); + } + + let config = &mut self.msix_config.lock().unwrap(); + let entry = &config.table_entries[vector as usize]; + // In case the vector control register associated with the entry + // has its first bit set, this means the vector is masked and the + // device should not inject the interrupt. + // Instead, the Pending Bit Array table is updated to reflect there + // is a pending interrupt for this specific vector. + if config.masked || entry.masked() { + config.set_pba_bit(vector, false); + return Ok(()); + } + + self.interrupt_source_group + .trigger(vector as InterruptIndex) + } + + fn notifier(&self, int_type: VirtioInterruptType) -> Option<&EventFd> { + let vector = match int_type { + VirtioInterruptType::Config => self.config_vector.load(Ordering::Acquire), + VirtioInterruptType::Queue(queue_index) => *self + .queues_vectors + .lock() + .unwrap() + .get(queue_index as usize)?, + }; + + self.interrupt_source_group + .notifier(vector as InterruptIndex) + } + + fn status(&self) -> Arc { + Arc::new(AtomicU32::new(0)) + } + + #[cfg(test)] + fn has_pending_interrupt(&self, interrupt_type: VirtioInterruptType) -> bool { + false + } +} + +impl PciDevice for VirtioPciDevice { + fn write_config_register( + &mut self, + reg_idx: usize, + offset: u64, + data: &[u8], + ) -> Option> { + // Handle the special case where the capability VIRTIO_PCI_CAP_PCI_CFG + // is accessed. This capability has a special meaning as it allows the + // guest to access other capabilities without mapping the PCI BAR. 
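The masking rules implemented in VirtioInterruptMsix::trigger above follow the usual MSI-X convention: if either the function-wide mask or the per-vector mask bit is set, the interrupt is not injected and the corresponding Pending Bit Array bit is recorded instead. A simplified, self-contained sketch of that decision, with hypothetical types rather than the crate's MsixConfig:

// Hypothetical, reduced model of an MSI-X table entry and the trigger decision.
struct MsixEntry {
    vector_ctl: u32, // bit 0 is the per-vector mask
}

struct Msix {
    function_masked: bool,
    entries: Vec<MsixEntry>,
    pba: Vec<bool>, // one pending bit per vector
}

impl Msix {
    fn trigger(&mut self, vector: usize, inject: impl Fn(usize)) {
        let entry_masked = self.entries[vector].vector_ctl & 0x1 != 0;
        if self.function_masked || entry_masked {
            // Record the interrupt as pending instead of injecting it.
            self.pba[vector] = true;
        } else {
            inject(vector);
        }
    }
}

fn main() {
    let mut msix = Msix {
        function_masked: false,
        entries: vec![MsixEntry { vector_ctl: 0x1 }, MsixEntry { vector_ctl: 0x0 }],
        pba: vec![false; 2],
    };
    msix.trigger(0, |v| println!("inject vector {v}")); // masked: only sets the PBA bit
    msix.trigger(1, |v| println!("inject vector {v}")); // unmasked: injected
    assert!(msix.pba[0] && !msix.pba[1]);
}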
+ let base = reg_idx * 4; + if base + u64_to_usize(offset) >= self.cap_pci_cfg_info.offset + && base + u64_to_usize(offset) + data.len() + <= self.cap_pci_cfg_info.offset + self.cap_pci_cfg_info.cap.bytes().len() + { + let offset = base + u64_to_usize(offset) - self.cap_pci_cfg_info.offset; + self.write_cap_pci_cfg(offset, data) + } else { + self.configuration + .write_config_register(reg_idx, offset, data); + None + } + } + + fn read_config_register(&mut self, reg_idx: usize) -> u32 { + // Handle the special case where the capability VIRTIO_PCI_CAP_PCI_CFG + // is accessed. This capability has a special meaning as it allows the + // guest to access other capabilities without mapping the PCI BAR. + let base = reg_idx * 4; + if base >= self.cap_pci_cfg_info.offset + && base + 4 <= self.cap_pci_cfg_info.offset + self.cap_pci_cfg_info.cap.bytes().len() + { + let offset = base - self.cap_pci_cfg_info.offset; + let mut data = [0u8; 4]; + self.read_cap_pci_cfg(offset, &mut data); + u32::from_le_bytes(data) + } else { + self.configuration.read_reg(reg_idx) + } + } + + fn detect_bar_reprogramming( + &mut self, + reg_idx: usize, + data: &[u8], + ) -> Option { + self.configuration.detect_bar_reprogramming(reg_idx, data) + } + + fn allocate_bars( + &mut self, + mmio32_allocator: &mut AddressAllocator, + mmio64_allocator: &mut AddressAllocator, + ) -> std::result::Result<(), PciDeviceError> { + let device_clone = self.device.clone(); + let device = device_clone.lock().unwrap(); + + // Allocate the virtio-pci capability BAR. + // See http://docs.oasis-open.org/virtio/virtio/v1.0/cs04/virtio-v1.0-cs04.html#x1-740004 + let virtio_pci_bar_addr = mmio64_allocator + .allocate( + CAPABILITY_BAR_SIZE, + CAPABILITY_BAR_SIZE, + AllocPolicy::FirstMatch, + ) + .unwrap() + .start(); + + let bar = PciBarConfiguration { + addr: virtio_pci_bar_addr, + size: CAPABILITY_BAR_SIZE, + idx: VIRTIO_COMMON_BAR_INDEX, + region_type: PciBarRegionType::Memory64BitRegion, + prefetchable: pci::PciBarPrefetchable::NotPrefetchable, + }; + + // The creation of the PCI BAR and its associated capabilities must + // happen only during the creation of a brand new VM. When a VM is + // restored from a known state, the BARs are already created with the + // right content, therefore we don't need to go through this codepath. + self.configuration + .add_pci_bar(&bar) + .map_err(|e| PciDeviceError::IoRegistrationFailed(virtio_pci_bar_addr, e))?; + + // Once the BARs are allocated, the capabilities can be added to the PCI configuration. + self.add_pci_capabilities()?; + self.bar_region = bar; + + Ok(()) + } + + fn move_bar( + &mut self, + old_base: u64, + new_base: u64, + ) -> std::result::Result<(), std::io::Error> { + // We only update our idea of the bar in order to support free_bars() above. + // The majority of the reallocation is done inside DeviceManager. + if self.bar_region.addr == old_base { + self.bar_region.addr = new_base; + } + + Ok(()) + } + + fn read_bar(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + match offset { + o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => { + self.common_config + .read(o - COMMON_CONFIG_BAR_OFFSET, data, self.device.clone()) + } + o if (ISR_CONFIG_BAR_OFFSET..ISR_CONFIG_BAR_OFFSET + ISR_CONFIG_SIZE).contains(&o) => { + if let Some(v) = data.get_mut(0) { + // Reading this register resets it to 0. 
+ *v = self + .interrupt_status + .swap(0, Ordering::AcqRel) + .try_into() + .unwrap(); + } + } + o if (DEVICE_CONFIG_BAR_OFFSET..DEVICE_CONFIG_BAR_OFFSET + DEVICE_CONFIG_SIZE) + .contains(&o) => + { + let device = self.device.lock().unwrap(); + device.read_config(o - DEVICE_CONFIG_BAR_OFFSET, data); + } + o if (NOTIFICATION_BAR_OFFSET..NOTIFICATION_BAR_OFFSET + NOTIFICATION_SIZE) + .contains(&o) => + { + // Handled with ioeventfds. + } + o if (MSIX_TABLE_BAR_OFFSET..MSIX_TABLE_BAR_OFFSET + MSIX_TABLE_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .read_table(o - MSIX_TABLE_BAR_OFFSET, data); + } + } + o if (MSIX_PBA_BAR_OFFSET..MSIX_PBA_BAR_OFFSET + MSIX_PBA_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .read_pba(o - MSIX_PBA_BAR_OFFSET, data); + } + } + _ => (), + } + } + + fn write_bar(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { + match offset { + o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => { + self.common_config + .write(o - COMMON_CONFIG_BAR_OFFSET, data, self.device.clone()) + } + o if (ISR_CONFIG_BAR_OFFSET..ISR_CONFIG_BAR_OFFSET + ISR_CONFIG_SIZE).contains(&o) => { + if let Some(v) = data.first() { + self.interrupt_status + .fetch_and(!(*v as usize), Ordering::AcqRel); + } + } + o if (DEVICE_CONFIG_BAR_OFFSET..DEVICE_CONFIG_BAR_OFFSET + DEVICE_CONFIG_SIZE) + .contains(&o) => + { + let mut device = self.device.lock().unwrap(); + device.write_config(o - DEVICE_CONFIG_BAR_OFFSET, data); + } + o if (NOTIFICATION_BAR_OFFSET..NOTIFICATION_BAR_OFFSET + NOTIFICATION_SIZE) + .contains(&o) => + { + // Handled with ioeventfds. + error!("Unexpected write to notification BAR: offset = 0x{:x}", o); + } + o if (MSIX_TABLE_BAR_OFFSET..MSIX_TABLE_BAR_OFFSET + MSIX_TABLE_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .write_table(o - MSIX_TABLE_BAR_OFFSET, data); + } + } + o if (MSIX_PBA_BAR_OFFSET..MSIX_PBA_BAR_OFFSET + MSIX_PBA_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .write_pba(o - MSIX_PBA_BAR_OFFSET, data); + } + } + _ => (), + }; + + // Try and activate the device if the driver status has changed + if self.needs_activation() { + debug!("Activating device"); + let interrupt = Arc::clone(self.virtio_interrupt.as_ref().unwrap()); + match self + .virtio_device() + .lock() + .unwrap() + .activate(self.memory.clone(), interrupt.clone()) + { + Ok(()) => self.device_activated.store(true, Ordering::SeqCst), + Err(err) => { + error!("Error activating device: {err:?}"); + + // Section 2.1.2 of the specification states that we need to send a device + // configuration change interrupt + let _ = interrupt.trigger(VirtioInterruptType::Config); + } + } + } else { + debug!("Device doesn't need activation"); + } + + // Device has been reset by the driver + if self.device_activated.load(Ordering::SeqCst) && self.is_driver_init() { + let mut device = self.device.lock().unwrap(); + let reset_result = device.reset(); + match reset_result { + Some((virtio_interrupt, mut _queue_evts)) => { + // Upon reset the device returns its interrupt EventFD + self.virtio_interrupt = Some(virtio_interrupt); + self.device_activated.store(false, Ordering::SeqCst); + + // Reset queue readiness (changes queue_enable), queue sizes + // and selected_queue as per spec for reset + self.virtio_device() + .lock() + .unwrap() + .queues_mut() + 
.iter_mut() + .for_each(Queue::reset); + self.common_config.queue_select = 0; + } + None => { + error!("Attempt to reset device when not implemented in underlying device"); + self.common_config.driver_status = DEVICE_FAILED; + } + } + } + + None + } +} + +impl BusDevice for VirtioPciDevice { + fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) { + self.read_bar(base, offset, data) + } + + fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option> { + self.write_bar(base, offset, data) + } +} + +#[cfg(test)] +mod tests { + use std::sync::{Arc, Mutex}; + + use event_manager::MutEventSubscriber; + use linux_loader::loader::Cmdline; + use pci::{PciBdf, PciClassCode, PciDevice, PciSubclass}; + + use super::VirtioPciDevice; + use crate::Vm; + use crate::arch::MEM_64BIT_DEVICES_START; + use crate::builder::tests::default_vmm; + use crate::devices::virtio::device::VirtioDevice; + use crate::devices::virtio::rng::Entropy; + use crate::devices::virtio::transport::pci::device::PciVirtioSubclass; + use crate::rate_limiter::RateLimiter; + + #[test] + fn test_pci_device_config() { + let mut vmm = default_vmm(); + vmm.device_manager.enable_pci(&vmm.vm); + let entropy = Arc::new(Mutex::new(Entropy::new(RateLimiter::default()).unwrap())); + vmm.device_manager + .attach_virtio_device( + &vmm.vm, + "rng".to_string(), + entropy.clone(), + &mut Cmdline::new(1024).unwrap(), + false, + ) + .unwrap(); + + let device = vmm + .device_manager + .pci_devices + .get_virtio_device(entropy.lock().unwrap().device_type(), "rng") + .unwrap(); + + let mut locked_virtio_pci_device = device.lock().unwrap(); + + // For more information on the values we are checking here, look into the VirtIO spec: + // https://docs.oasis-open.org/virtio/virtio/v1.1/csprd01/virtio-v1.1-csprd01.html#x1-1220007 + // and the PCI Header type 0 layout here: https://wiki.osdev.org/PCI#Configuration_Space + + // | 16 bits | 16 bits | + // |-----------|-----------| + // register 0x0: | Device ID | Vendor ID | + // + // Vendor ID of VirtIO devices is 0x1af4 + let reg0 = locked_virtio_pci_device.read_config_register(0); + assert_eq!(reg0 & 0xffff, 0x1af4); + // VirtIO PCI device IDs are in the range [0x1000, 0x107f]. (We are not using transitional + // device IDs). + let devid = reg0 >> 16; + assert!( + (0x1000..=0x107f).contains(&devid), + "Device ID check: {:#x} >= 0x1000 && {:#x} <= 0x107f", + devid, + devid + ); + + // | 16 bits | 16 bits | + // |------------|-----------| + // register 0x1: | Status | Command | + // We offer the capabilities list (bit 4 of status register) at offset 0x34 + let reg1 = locked_virtio_pci_device.read_config_register(1); + assert_eq!(reg1, 0x0010_0000); + + // | 8 bits | 8 bits | 8 bits | 8 bits | + // register 0x2: | Class code | Subclass | Prog IF | Revision ID | + // + // Class code/Subclass: PciClassCode::NetworkController for net, PciClassCode::MassStorage for block, + // PciClassCode::Other (with the NonTransitionalBase subclass) for everything else + // Prog IF: A register defining some programmable interface register.
0 for VirtIO devices + // Revision ID: 0x1 for modern VirtIO devices + let reg2 = locked_virtio_pci_device.read_config_register(2); + assert_eq!(reg2, 0xffff_0001); + let class_code = ((reg2 >> 24) & 0xff) as u8; + assert_eq!(class_code, PciClassCode::Other.get_register_value()); + let subclass = ((reg2 >> 16) & 0xff) as u8; + assert_eq!( + subclass, + PciVirtioSubclass::NonTransitionalBase.get_register_value() + ); + let prog_if = ((reg2 >> 8) & 0xff) as u8; + assert_eq!(prog_if, 0); + let revision_id = reg2 & 0xff; + assert_eq!(revision_id, 0x1); + + // | 8 bits | 8 bits | 8 bits | 8 bits | + // register 0x3: | BIST | Header Type | Latency timer | Cache line size | + // + // BIST: status and control for self test of PCI devices. Always 0 for VirtIO devices + // HeaderType: 0x0 for general devices + // LatencyTimer: Latency timer in units of PCI bus clocks, 0 for VirtIO + // Cache Line size: 0 for VirtIO devices + let reg3 = locked_virtio_pci_device.read_config_register(3); + assert_eq!(reg3, 0x0); + + // register 0xa: Cardbus CIS pointer + // + // We don't emulate CardBus + let reg10 = locked_virtio_pci_device.read_config_register(0xa); + assert_eq!(reg10, 0); + + // | 16 bits | 16 bits | + // regiger 0xb: | Subsystem ID | Subsystem vendor ID| + // + // For us Subsystem ID is same as device ID and subsystem vendor ID is same as vendor ID + // (reg 0x0) + let reg11 = locked_virtio_pci_device.read_config_register(0xb); + assert_eq!(reg11, reg0); + + // register 0xc: Expansion ROM base address: 0x0 for us + let reg12 = locked_virtio_pci_device.read_config_register(0xc); + assert_eq!(reg12, 0); + + // | 24 bits | 8 bits | + // register 0xd: | Reserved | Capabilities pointer | + let reg13 = locked_virtio_pci_device.read_config_register(0xd); + assert_eq!(reg13 >> 24, 0); + + // register 0xe: Reserved + let reg14 = locked_virtio_pci_device.read_config_register(0xe); + assert_eq!(reg14, 0); + + // | 8 bits | 8 bits | 8 bits | 8 bits | + // register 0xf: | max latency | min grant | Interrupt pin | Interrupt line | + // + // We don't specify any of those + let reg15 = locked_virtio_pci_device.read_config_register(0xf); + assert_eq!(reg15, 0); + } + + #[test] + fn test_reading_bars() { + let mut vmm = default_vmm(); + vmm.device_manager.enable_pci(&vmm.vm); + let entropy = Arc::new(Mutex::new(Entropy::new(RateLimiter::default()).unwrap())); + vmm.device_manager + .attach_virtio_device( + &vmm.vm, + "rng".to_string(), + entropy.clone(), + &mut Cmdline::new(1024).unwrap(), + false, + ) + .unwrap(); + + let device = vmm + .device_manager + .pci_devices + .get_virtio_device(entropy.lock().unwrap().device_type(), "rng") + .unwrap(); + + let mut locked_virtio_pci_device = device.lock().unwrap(); + + // According to OSdev wiki (https://wiki.osdev.org/PCI#Configuration_Space): + // + // When you want to retrieve the actual base address of a BAR, be sure to mask the lower + // bits. For 16-bit Memory Space BARs, you calculate (BAR[x] & 0xFFF0). For 32-bit Memory + // Space BARs, you calculate (BAR[x] & 0xFFFFFFF0). For 64-bit Memory Space BARs, you + // calculate ((BAR[x] & 0xFFFFFFF0) + ((BAR[x + 1] & 0xFFFFFFFF) << 32)) For I/O Space + // BARs, you calculate (BAR[x] & 0xFFFFFFFC). + + // We are allocating a single 64-bit MMIO bar for VirtIO capabilities list. As a result, we + // are using the first two BAR registers from the configuration space. 
+ // + // The BAR address layout is as follows: + // + // | Bits 31-4 | Bit 3 | Bits 2-1 | Bit 0 | + // | 16-Byte Aligned Base Address | Prefetchable | Type | Always 0 | + // + // For 64-bit addresses though a second BAR is used to hold the upper 32 bits + // of the address. Prefetchable and type will be help in the lower bits of the + // first bar along with the lower 32-bits of the address which is always 16-bytes + // aligned. + let bar_addr_lo = locked_virtio_pci_device.read_config_register(0x4); + let bar_addr_hi = locked_virtio_pci_device.read_config_register(0x5); + let bar_addr = bar_addr_lo as u64 + ((bar_addr_hi as u64) << 32); + + // Bit 0 always 0 + assert_eq!(bar_addr & 0x1, 0); + // Type is 0x2 meaning 64-bit BAR + assert_eq!((bar_addr & 0x6) >> 1, 2); + // The actual address of the BAR should be the first available address of our 64-bit MMIO + // region + assert_eq!(bar_addr & 0xffff_ffff_ffff_fff0, MEM_64BIT_DEVICES_START); + + // Reading the BAR size is a bit more convoluted. According to OSDev wiki: + // + // To determine the amount of address space needed by a PCI device, you must save the + // original value of the BAR, write a value of all 1's to the register, then read it back. + // The amount of memory can then be determined by masking the information bits, performing + // a bitwise NOT ('~' in C), and incrementing the value by 1. + + locked_virtio_pci_device.write_config_register(0x4, 0, &[0xff, 0xff, 0xff, 0xff]); + // Read the lower size bits and mask out the last 4 bits include Prefetchable, Type and + // hardwired-0 + let bar_size_lo = locked_virtio_pci_device.read_config_register(0x4) as u64 & 0xfffffff0; + locked_virtio_pci_device.write_config_register(0x5, 0, &[0xff, 0xff, 0xff, 0xff]); + let bar_size_hi = locked_virtio_pci_device.read_config_register(0x5) as u64; + let bar_size = !((bar_size_hi << 32) | bar_size_lo) + 1; + + // We create a capabilities BAR region of 0x80000 bytes + assert_eq!(bar_size, 0x80000); + } +} diff --git a/src/vmm/src/devices/virtio/transport/pci/mod.rs b/src/vmm/src/devices/virtio/transport/pci/mod.rs new file mode 100644 index 00000000000..520b52274b3 --- /dev/null +++ b/src/vmm/src/devices/virtio/transport/pci/mod.rs @@ -0,0 +1,5 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +pub mod common_config; +pub mod device; diff --git a/src/vmm/src/devices/virtio/vhost_user.rs b/src/vmm/src/devices/virtio/vhost_user.rs index 13b0d71b35a..556a8adafaf 100644 --- a/src/vmm/src/devices/virtio/vhost_user.rs +++ b/src/vmm/src/devices/virtio/vhost_user.rs @@ -6,6 +6,7 @@ use std::os::fd::AsRawFd; use std::os::unix::net::UnixStream; +use std::sync::Arc; use vhost::vhost_user::message::*; use vhost::vhost_user::{Frontend, VhostUserFrontend}; @@ -13,8 +14,8 @@ use vhost::{Error as VhostError, VhostBackend, VhostUserMemoryRegionInfo, VringC use vm_memory::{Address, Error as MmapError, GuestMemory, GuestMemoryError, GuestMemoryRegion}; use vmm_sys_util::eventfd::EventFd; -use crate::devices::virtio::device::IrqTrigger; use crate::devices::virtio::queue::Queue; +use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::vstate::memory::GuestMemoryMmap; /// vhost-user error. @@ -400,7 +401,7 @@ impl VhostUserHandleImpl { &mut self, mem: &GuestMemoryMmap, queues: &[(usize, &Queue, &EventFd)], - irq_trigger: &IrqTrigger, + interrupt: Arc, ) -> Result<(), VhostUserError> { // Provide the memory table to the backend. 
self.update_mem_table(mem)?; @@ -442,7 +443,17 @@ impl VhostUserHandleImpl { // No matter the queue, we set irq_evt for signaling the guest that buffers were // consumed. self.vu - .set_vring_call(*queue_index, &irq_trigger.irq_evt) + .set_vring_call( + *queue_index, + interrupt + .notifier(VirtioInterruptType::Queue( + (*queue_index).try_into().unwrap_or_else(|_| { + panic!("vhost-user: invalid queue index: {}", *queue_index) + }), + )) + .as_ref() + .unwrap(), + ) .map_err(VhostUserError::VhostUserSetVringCall)?; self.vu @@ -467,6 +478,7 @@ pub(crate) mod tests { use vmm_sys_util::tempfile::TempFile; use super::*; + use crate::devices::virtio::test_utils::default_interrupt; use crate::test_utils::create_tmp_socket; use crate::vstate::memory; use crate::vstate::memory::GuestAddress; @@ -901,11 +913,11 @@ pub(crate) mod tests { queue.initialize(&guest_memory).unwrap(); let event_fd = EventFd::new(0).unwrap(); - let irq_trigger = IrqTrigger::new().unwrap(); let queues = [(0, &queue, &event_fd)]; - vuh.setup_backend(&guest_memory, &queues, &irq_trigger) + let interrupt = default_interrupt(); + vuh.setup_backend(&guest_memory, &queues, interrupt.clone()) .unwrap(); // VhostUserHandleImpl should correctly send memory and queues information to @@ -929,7 +941,11 @@ pub(crate) mod tests { log_addr: None, }, base: queue.avail_ring_idx_get(), - call: irq_trigger.irq_evt.as_raw_fd(), + call: interrupt + .notifier(VirtioInterruptType::Queue(0u16)) + .as_ref() + .unwrap() + .as_raw_fd(), kick: event_fd.as_raw_fd(), enable: true, }; diff --git a/src/vmm/src/devices/virtio/vsock/device.rs b/src/vmm/src/devices/virtio/vsock/device.rs index a4377768322..43c9d4cb2ba 100644 --- a/src/vmm/src/devices/virtio/vsock/device.rs +++ b/src/vmm/src/devices/virtio/vsock/device.rs @@ -6,7 +6,7 @@ // found in the THIRD-PARTY file. //! This is the `VirtioDevice` implementation for our vsock device. It handles the virtio-level -//! device logic: feature negociation, device configuration, and device activation. +//! device logic: feature negotiation, device configuration, and device activation. //! //! We aim to conform to the VirtIO v1.1 spec: //! https://docs.oasis-open.org/virtio/virtio/v1.1/virtio-v1.1.html @@ -21,8 +21,10 @@ //! - a backend FD. 
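Both the vhost-user hunk above and the vsock changes below apply the same transport refactor: the device no longer owns an IrqTrigger; it receives a VirtioInterrupt handle when it is activated and uses it to signal used buffers per queue. A stand-alone sketch of that interface shape, with a hypothetical trait and names rather than the crate's exact VirtioInterrupt definition:

use std::sync::Arc;

// Hypothetical minimal interrupt interface; the crate's VirtioInterrupt trait
// also exposes notifier(), trigger_queues(), etc.
enum IntType {
    Config,
    Queue(u16),
}

trait Interrupt: Send + Sync {
    fn trigger(&self, int_type: IntType) -> Result<(), std::io::Error>;
}

struct PrintlnInterrupt;
impl Interrupt for PrintlnInterrupt {
    fn trigger(&self, int_type: IntType) -> Result<(), std::io::Error> {
        match int_type {
            IntType::Config => println!("config change interrupt"),
            IntType::Queue(q) => println!("used-buffer interrupt for queue {q}"),
        }
        Ok(())
    }
}

// The device only holds an interrupt handle once it has been activated.
struct Device {
    interrupt: Option<Arc<dyn Interrupt>>,
}

impl Device {
    fn activate(&mut self, interrupt: Arc<dyn Interrupt>) {
        self.interrupt = Some(interrupt);
    }

    fn signal_used_queue(&self, qidx: u16) {
        self.interrupt
            .as_ref()
            .expect("device is not activated")
            .trigger(IntType::Queue(qidx))
            .unwrap();
    }

    fn signal_config_change(&self) {
        self.interrupt
            .as_ref()
            .expect("device is not activated")
            .trigger(IntType::Config)
            .unwrap();
    }
}

fn main() {
    let mut dev = Device { interrupt: None };
    dev.activate(Arc::new(PrintlnInterrupt));
    dev.signal_used_queue(0);
    dev.signal_config_change();
}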
use std::fmt::Debug; +use std::ops::Deref; +use std::sync::Arc; -use log::{error, warn}; +use log::{error, info, warn}; use vmm_sys_util::eventfd::EventFd; use super::super::super::DeviceError; @@ -30,9 +32,10 @@ use super::defs::uapi; use super::packet::{VSOCK_PKT_HDR_SIZE, VsockPacketRx, VsockPacketTx}; use super::{VsockBackend, defs}; use crate::devices::virtio::ActivateError; -use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; +use crate::devices::virtio::device::{ActiveState, DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_config::{VIRTIO_F_IN_ORDER, VIRTIO_F_VERSION_1}; use crate::devices::virtio::queue::{InvalidAvailIdx, Queue as VirtQueue}; +use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::devices::virtio::vsock::VsockError; use crate::devices::virtio::vsock::metrics::METRICS; use crate::logger::IncMetric; @@ -61,7 +64,6 @@ pub struct Vsock { pub(crate) backend: B, pub(crate) avail_features: u64, pub(crate) acked_features: u64, - pub(crate) irq_trigger: IrqTrigger, // This EventFd is the only one initially registered for a vsock device, and is used to convert // a VirtioDevice::activate call into an EventHandler read event which allows the other events // (queue and backend related) to be registered post virtio device activation. That's @@ -102,7 +104,6 @@ where backend, avail_features: AVAIL_FEATURES, acked_features: 0, - irq_trigger: IrqTrigger::new().map_err(VsockError::EventFd)?, activate_evt: EventFd::new(libc::EFD_NONBLOCK).map_err(VsockError::EventFd)?, device_state: DeviceState::Inactive, rx_packet: VsockPacketRx::new()?, @@ -136,9 +137,24 @@ where /// Signal the guest driver that we've used some virtio buffers that it had previously made /// available. - pub fn signal_used_queue(&self) -> Result<(), DeviceError> { - self.irq_trigger - .trigger_irq(IrqType::Vring) + pub fn signal_used_queue(&self, qidx: usize) -> Result<(), DeviceError> { + self.device_state + .active_state() + .expect("Device is not initialized") + .interrupt + .trigger(VirtioInterruptType::Queue(qidx.try_into().unwrap_or_else( + |_| panic!("vsock: invalid queue index: {qidx}"), + ))) + .map_err(DeviceError::FailedSignalingIrq) + } + + /// Signal the guest which queues are ready to be consumed + pub fn signal_used_queues(&self, used_queues: &[u16]) -> Result<(), DeviceError> { + self.device_state + .active_state() + .expect("Device is not initialized") + .interrupt + .trigger_queues(used_queues) .map_err(DeviceError::FailedSignalingIrq) } @@ -147,7 +163,7 @@ where /// otherwise. pub fn process_rx(&mut self) -> Result { // This is safe since we checked in the event handler that the device is activated. - let mem = self.device_state.mem().unwrap(); + let mem = &self.device_state.active_state().unwrap().mem; let queue = &mut self.queues[RXQ_INDEX]; let mut have_used = false; @@ -200,7 +216,7 @@ where /// ring, and `false` otherwise. pub fn process_tx(&mut self) -> Result { // This is safe since we checked in the event handler that the device is activated. - let mem = self.device_state.mem().unwrap(); + let mem = &self.device_state.active_state().unwrap().mem; let queue = &mut self.queues[TXQ_INDEX]; let mut have_used = false; @@ -240,7 +256,7 @@ where // remain but their CID is updated to reflect the current guest_cid. pub fn send_transport_reset_event(&mut self) -> Result<(), DeviceError> { // This is safe since we checked in the caller function that the device is activated. 
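The repeated device_state.active_state() calls above work because activation now stores the guest memory handle and the interrupt object together inside the device state. A reduced model of that state machine, with stand-in types (the real ActiveState holds a GuestMemoryMmap and an Arc<dyn VirtioInterrupt>):

use std::sync::Arc;
use std::sync::atomic::{AtomicUsize, Ordering};

// Stand-in for GuestMemoryMmap.
struct GuestMem;

struct ActiveState {
    mem: GuestMem,
    // Stand-in for the interrupt object: counts how many times we signalled.
    interrupt: Arc<AtomicUsize>,
}

enum DeviceState {
    Inactive,
    Activated(ActiveState),
}

impl DeviceState {
    fn active_state(&self) -> Option<&ActiveState> {
        match self {
            DeviceState::Activated(state) => Some(state),
            DeviceState::Inactive => None,
        }
    }
}

fn main() {
    let mut state = DeviceState::Inactive;
    assert!(state.active_state().is_none());

    // Activation hands both resources to the device in one step.
    state = DeviceState::Activated(ActiveState {
        mem: GuestMem,
        interrupt: Arc::new(AtomicUsize::new(0)),
    });

    // Device code can now unwrap the active state and use both resources together.
    let active = state.active_state().expect("Device is not initialized");
    active.interrupt.fetch_add(1, Ordering::SeqCst);
    let _mem = &active.mem;
    assert_eq!(active.interrupt.load(Ordering::SeqCst), 1);
}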
- let mem = self.device_state.mem().unwrap(); + let mem = &self.device_state.active_state().unwrap().mem; let queue = &mut self.queues[EVQ_INDEX]; let head = queue.pop()?.ok_or_else(|| { @@ -256,7 +272,7 @@ where }); queue.advance_used_ring_idx(); - self.signal_used_queue()?; + self.signal_used_queue(EVQ_INDEX)?; Ok(()) } @@ -294,8 +310,12 @@ where &self.queue_events } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self.irq_trigger + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { + self.device_state + .active_state() + .expect("Device is not initialized") + .interrupt + .deref() } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -327,7 +347,11 @@ where ); } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc, + ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) .map_err(ActivateError::QueueMemoryError)?; @@ -346,7 +370,7 @@ where return Err(ActivateError::EventFd); } - self.device_state = DeviceState::Activated(mem); + self.device_state = DeviceState::Activated(ActiveState { mem, interrupt }); Ok(()) } @@ -354,6 +378,19 @@ where fn is_activated(&self) -> bool { self.device_state.is_activated() } + + fn kick(&mut self) { + // Vsock has complicated protocol that isn't resilient to any packet loss, + // so for Vsock we don't support connection persistence through snapshot. + // Any in-flight packets or events are simply lost. + // Vsock is restored 'empty'. + // The only reason we still `kick` it is to make guest process + // `TRANSPORT_RESET_EVENT` event we sent during snapshot creation. + if self.is_activated() { + info!("kick vsock {}.", self.id()); + self.signal_used_queue(EVQ_INDEX).unwrap(); + } + } } #[cfg(test)] @@ -429,6 +466,8 @@ mod tests { // } // Test a correct activation. - ctx.device.activate(ctx.mem.clone()).unwrap(); + ctx.device + .activate(ctx.mem.clone(), ctx.interrupt.clone()) + .unwrap(); } } diff --git a/src/vmm/src/devices/virtio/vsock/event_handler.rs b/src/vmm/src/devices/virtio/vsock/event_handler.rs index 59fbd3eaa3d..a983a332aa3 100755 --- a/src/vmm/src/devices/virtio/vsock/event_handler.rs +++ b/src/vmm/src/devices/virtio/vsock/event_handler.rs @@ -34,6 +34,7 @@ use super::VsockBackend; use super::device::{EVQ_INDEX, RXQ_INDEX, TXQ_INDEX, Vsock}; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::InvalidAvailIdx; +use crate::devices::virtio::vsock::defs::VSOCK_NUM_QUEUES; use crate::devices::virtio::vsock::metrics::METRICS; use crate::logger::IncMetric; @@ -47,81 +48,82 @@ where const PROCESS_EVQ: u32 = 3; const PROCESS_NOTIFY_BACKEND: u32 = 4; - pub fn handle_rxq_event(&mut self, evset: EventSet) -> bool { + pub fn handle_rxq_event(&mut self, evset: EventSet) -> Vec { + let mut used_queues = Vec::new(); if evset != EventSet::IN { warn!("vsock: rxq unexpected event {:?}", evset); METRICS.rx_queue_event_fails.inc(); - return false; + return used_queues; } - let mut raise_irq = false; if let Err(err) = self.queue_events[RXQ_INDEX].read() { error!("Failed to get vsock rx queue event: {:?}", err); METRICS.rx_queue_event_fails.inc(); } else if self.backend.has_pending_rx() { - // OK to unwrap: Only QueueError::InvalidAvailIdx is returned, and we explicitly - // want to panic on that one. 
- raise_irq |= self.process_rx().unwrap(); + if self.process_rx().unwrap() { + used_queues.push(RXQ_INDEX.try_into().unwrap()); + } METRICS.rx_queue_event_count.inc(); } - raise_irq + used_queues } - pub fn handle_txq_event(&mut self, evset: EventSet) -> bool { + pub fn handle_txq_event(&mut self, evset: EventSet) -> Vec { + let mut used_queues = Vec::new(); if evset != EventSet::IN { warn!("vsock: txq unexpected event {:?}", evset); METRICS.tx_queue_event_fails.inc(); - return false; + return used_queues; } - let mut raise_irq = false; if let Err(err) = self.queue_events[TXQ_INDEX].read() { error!("Failed to get vsock tx queue event: {:?}", err); METRICS.tx_queue_event_fails.inc(); } else { - // OK to unwrap: Only QueueError::InvalidAvailIdx is returned, and we explicitly - // want to panic on that one. - raise_irq |= self.process_tx().unwrap(); + if self.process_tx().unwrap() { + used_queues.push(TXQ_INDEX.try_into().unwrap()); + } METRICS.tx_queue_event_count.inc(); // The backend may have queued up responses to the packets we sent during // TX queue processing. If that happened, we need to fetch those responses // and place them into RX buffers. - if self.backend.has_pending_rx() { - raise_irq |= self.process_rx().unwrap(); + if self.backend.has_pending_rx() && self.process_rx().unwrap() { + used_queues.push(RXQ_INDEX.try_into().unwrap()); } } - raise_irq + used_queues } - pub fn handle_evq_event(&mut self, evset: EventSet) -> bool { + pub fn handle_evq_event(&mut self, evset: EventSet) { if evset != EventSet::IN { warn!("vsock: evq unexpected event {:?}", evset); METRICS.ev_queue_event_fails.inc(); - return false; + return; } if let Err(err) = self.queue_events[EVQ_INDEX].read() { error!("Failed to consume vsock evq event: {:?}", err); METRICS.ev_queue_event_fails.inc(); } - false } /// Notify backend of new events. - pub fn notify_backend(&mut self, evset: EventSet) -> Result { + pub fn notify_backend(&mut self, evset: EventSet) -> Result, InvalidAvailIdx> { + let mut used_queues = Vec::new(); self.backend.notify(evset); // After the backend has been kicked, it might've freed up some resources, so we // can attempt to send it more data to process. // In particular, if `self.backend.send_pkt()` halted the TX queue processing (by // returning an error) at some point in the past, now is the time to try walking the // TX queue again. - // OK to unwrap: Only QueueError::InvalidAvailIdx is returned, and we explicitly - // want to panic on that one. - let mut raise_irq = self.process_tx()?; - if self.backend.has_pending_rx() { - raise_irq |= self.process_rx()?; + if self.process_tx()? { + used_queues.push(TXQ_INDEX.try_into().unwrap()); + } + if self.backend.has_pending_rx() && self.process_rx()? 
{ + used_queues.push(RXQ_INDEX.try_into().unwrap()) } - Ok(raise_irq) + + Ok(used_queues) } fn register_runtime_events(&self, ops: &mut EventOps) { @@ -189,18 +191,25 @@ where let evset = event.event_set(); if self.is_activated() { - let mut raise_irq = false; - match source { - Self::PROCESS_ACTIVATE => self.handle_activate_event(ops), - Self::PROCESS_RXQ => raise_irq = self.handle_rxq_event(evset), - Self::PROCESS_TXQ => raise_irq = self.handle_txq_event(evset), - Self::PROCESS_EVQ => raise_irq = self.handle_evq_event(evset), - Self::PROCESS_NOTIFY_BACKEND => raise_irq = self.notify_backend(evset).unwrap(), - _ => warn!("Unexpected vsock event received: {:?}", source), - } - if raise_irq { - self.signal_used_queue().unwrap_or_default(); - } + let used_queues = match source { + Self::PROCESS_ACTIVATE => { + self.handle_activate_event(ops); + Vec::new() + } + Self::PROCESS_RXQ => self.handle_rxq_event(evset), + Self::PROCESS_TXQ => self.handle_txq_event(evset), + Self::PROCESS_EVQ => { + self.handle_evq_event(evset); + Vec::new() + } + Self::PROCESS_NOTIFY_BACKEND => self.notify_backend(evset).unwrap(), + _ => { + warn!("Unexpected vsock event received: {:?}", source); + Vec::new() + } + }; + self.signal_used_queues(&used_queues) + .expect("vsock: Could not trigger device interrupt"); } else { warn!( "Vsock: The device is not yet activated. Spurious event received: {:?}", @@ -240,7 +249,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); ctx.device.backend.set_pending_rx(false); ctx.signal_txq_event(); @@ -257,7 +266,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); ctx.device.backend.set_pending_rx(true); ctx.signal_txq_event(); @@ -273,7 +282,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); ctx.device.backend.set_pending_rx(false); ctx.device.backend.set_tx_err(Some(VsockError::NoData)); @@ -289,7 +298,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); // Invalidate the descriptor chain, by setting its length to 0. 
ctx.guest_txvq.dtable[0].len.set(0); @@ -306,9 +315,11 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); - assert!(!ctx.device.handle_txq_event(EventSet::IN)); + let metric_before = METRICS.tx_queue_event_fails.count(); + ctx.device.handle_txq_event(EventSet::IN); + assert_eq!(metric_before + 1, METRICS.tx_queue_event_fails.count()); } } @@ -321,7 +332,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); ctx.device.backend.set_pending_rx(true); ctx.device.backend.set_rx_err(Some(VsockError::NoData)); @@ -338,7 +349,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); ctx.device.backend.set_pending_rx(true); ctx.signal_rxq_event(); @@ -351,7 +362,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); // Invalidate the descriptor chain, by setting its length to 0. ctx.guest_rxvq.dtable[0].len.set(0); @@ -367,9 +378,11 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); ctx.device.backend.set_pending_rx(false); - assert!(!ctx.device.handle_rxq_event(EventSet::IN)); + let metric_before = METRICS.rx_queue_event_fails.count(); + ctx.device.handle_rxq_event(EventSet::IN); + assert_eq!(metric_before + 1, METRICS.rx_queue_event_fails.count()); } } @@ -380,7 +393,9 @@ mod tests { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); ctx.device.backend.set_pending_rx(false); - assert!(!ctx.device.handle_evq_event(EventSet::IN)); + let metric_before = METRICS.ev_queue_event_fails.count(); + ctx.device.handle_evq_event(EventSet::IN); + assert_eq!(metric_before + 1, METRICS.ev_queue_event_fails.count()); } } @@ -392,7 +407,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); ctx.device.backend.set_pending_rx(true); ctx.device.notify_backend(EventSet::IN).unwrap(); @@ -411,7 +426,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); ctx.device.backend.set_pending_rx(false); ctx.device.notify_backend(EventSet::IN).unwrap(); @@ -454,7 +469,7 @@ mod tests { { let mut ctx = test_ctx.create_event_handler_context(); - // When modifiyng the buffer descriptor, make sure the len field is altered in the + // When modifying the buffer descriptor, make sure the len field is altered in the // vsock packet header descriptor as well. if desc_idx == 1 { // The vsock packet len field has offset 24 in the header. 
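The converted tests above no longer rely on a boolean return from the queue-event handlers; they assert on metric deltas instead. A toy version of that pattern (the real code checks METRICS counters such as tx_queue_event_fails; the atomic counter here is a hypothetical stand-in):

use std::sync::atomic::{AtomicU64, Ordering};

// Stand-in for a METRICS counter.
static TX_QUEUE_EVENT_FAILS: AtomicU64 = AtomicU64::new(0);

fn handle_txq_event(event_fd_fired: bool) {
    if !event_fd_fired {
        // The failure path is no longer reported through a return value,
        // only through the metric.
        TX_QUEUE_EVENT_FAILS.fetch_add(1, Ordering::Relaxed);
    }
}

fn main() {
    let before = TX_QUEUE_EVENT_FAILS.load(Ordering::Relaxed);
    handle_txq_event(false);
    assert_eq!(before + 1, TX_QUEUE_EVENT_FAILS.load(Ordering::Relaxed));
}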
@@ -480,8 +495,8 @@ mod tests { #[cfg(target_arch = "x86_64")] #[allow(clippy::cast_possible_truncation)] /* casting of constants we know fit into u32 */ fn test_vsock_bof() { - use crate::arch::MMIO_MEM_START; - use crate::arch::x86_64::{FIRST_ADDR_PAST_32BITS, MEM_32BIT_GAP_SIZE}; + use crate::arch::x86_64::layout::FIRST_ADDR_PAST_32BITS; + use crate::arch::{MMIO32_MEM_SIZE, MMIO32_MEM_START}; use crate::devices::virtio::vsock::packet::VSOCK_PKT_HDR_SIZE; use crate::test_utils::multi_region_mem; use crate::utils::mib_to_bytes; @@ -492,7 +507,7 @@ mod tests { let mut test_ctx = TestContext::new(); test_ctx.mem = multi_region_mem(&[ (GuestAddress(0), 8 * MIB), - (GuestAddress(MMIO_MEM_START - MIB as u64), MIB), + (GuestAddress(MMIO32_MEM_START - MIB as u64), MIB), (GuestAddress(FIRST_ADDR_PAST_32BITS), MIB), ]); @@ -515,15 +530,15 @@ mod tests { } // Let's check what happens when the header descriptor is right before the gap. - vsock_bof_helper(&mut test_ctx, 0, MMIO_MEM_START - 1, VSOCK_PKT_HDR_SIZE); + vsock_bof_helper(&mut test_ctx, 0, MMIO32_MEM_START - 1, VSOCK_PKT_HDR_SIZE); // Let's check what happens when the buffer descriptor crosses into the gap, but does // not go past its right edge. vsock_bof_helper( &mut test_ctx, 1, - MMIO_MEM_START - 4, - MEM_32BIT_GAP_SIZE as u32 + 4, + MMIO32_MEM_START - 4, + MMIO32_MEM_SIZE as u32 + 4, ); // Let's modify the buffer descriptor addr and len such that it crosses over the MMIO gap, @@ -531,8 +546,8 @@ mod tests { vsock_bof_helper( &mut test_ctx, 1, - MMIO_MEM_START - 4, - MEM_32BIT_GAP_SIZE as u32 + 100, + MMIO32_MEM_START - 4, + MMIO32_MEM_SIZE as u32 + 100, ); } @@ -582,7 +597,7 @@ mod tests { vsock .lock() .unwrap() - .activate(test_ctx.mem.clone()) + .activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()) .unwrap(); // Process the activate event. let ev_count = event_manager.run_with_timeout(50).unwrap(); diff --git a/src/vmm/src/devices/virtio/vsock/persist.rs b/src/vmm/src/devices/virtio/vsock/persist.rs index fce6affae69..6775707da3e 100644 --- a/src/vmm/src/devices/virtio/vsock/persist.rs +++ b/src/vmm/src/devices/virtio/vsock/persist.rs @@ -5,14 +5,14 @@ use std::fmt::Debug; use std::sync::Arc; -use std::sync::atomic::AtomicU32; use serde::{Deserialize, Serialize}; use super::*; -use crate::devices::virtio::device::DeviceState; +use crate::devices::virtio::device::{ActiveState, DeviceState}; use crate::devices::virtio::persist::VirtioDeviceState; use crate::devices::virtio::queue::FIRECRACKER_MAX_QUEUE_SIZE; +use crate::devices::virtio::transport::VirtioInterrupt; use crate::devices::virtio::vsock::TYPE_VSOCK; use crate::snapshot::Persist; use crate::vstate::memory::GuestMemoryMmap; @@ -29,9 +29,9 @@ pub struct VsockState { /// The Vsock frontend serializable state. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct VsockFrontendState { - /// Context IDentifier. + /// Context Identifier. pub cid: u64, - virtio_state: VirtioDeviceState, + pub virtio_state: VirtioDeviceState, } /// An enum for the serializable backend state types. 
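VsockState and VsockFrontendState above derive Serialize and Deserialize so they can be carried inside the microVM snapshot. A minimal round-trip of a similarly shaped, hypothetical struct (assuming serde with the derive feature and serde_json are available; the crate itself goes through its own Snapshot machinery):

use serde::{Deserialize, Serialize};

// Simplified stand-in for a frontend state struct.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
struct FrontendState {
    cid: u64,
    activated: bool,
    acked_features: u64,
}

fn main() {
    let state = FrontendState { cid: 3, activated: false, acked_features: 0x1 };

    // Serialize for the snapshot, then restore and compare.
    let bytes = serde_json::to_vec(&state).unwrap();
    let restored: FrontendState = serde_json::from_slice(&bytes).unwrap();
    assert_eq!(state, restored);
}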
@@ -121,13 +121,7 @@ where vsock.acked_features = state.virtio_state.acked_features; vsock.avail_features = state.virtio_state.avail_features; - vsock.irq_trigger.irq_status = - Arc::new(AtomicU32::new(state.virtio_state.interrupt_status)); - vsock.device_state = if state.virtio_state.activated { - DeviceState::Activated(constructor_args.mem) - } else { - DeviceState::Inactive - }; + vsock.device_state = DeviceState::Inactive; Ok(vsock) } } @@ -137,6 +131,7 @@ pub(crate) mod tests { use super::device::AVAIL_FEATURES; use super::*; use crate::devices::virtio::device::VirtioDevice; + use crate::devices::virtio::test_utils::default_interrupt; use crate::devices::virtio::vsock::defs::uapi; use crate::devices::virtio::vsock::test_utils::{TestBackend, TestContext}; use crate::snapshot::Snapshot; diff --git a/src/vmm/src/devices/virtio/vsock/test_utils.rs b/src/vmm/src/devices/virtio/vsock/test_utils.rs index 921c2e79bdb..b38ce070c66 100644 --- a/src/vmm/src/devices/virtio/vsock/test_utils.rs +++ b/src/vmm/src/devices/virtio/vsock/test_utils.rs @@ -5,6 +5,7 @@ #![doc(hidden)] use std::os::unix::io::{AsRawFd, RawFd}; +use std::sync::Arc; use vmm_sys_util::epoll::EventSet; use vmm_sys_util::eventfd::EventFd; @@ -12,7 +13,8 @@ use vmm_sys_util::eventfd::EventFd; use super::packet::{VsockPacketRx, VsockPacketTx}; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::{VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; -use crate::devices::virtio::test_utils::VirtQueue as GuestQ; +use crate::devices::virtio::test_utils::{VirtQueue as GuestQ, default_interrupt}; +use crate::devices::virtio::transport::VirtioInterrupt; use crate::devices::virtio::vsock::device::{RXQ_INDEX, TXQ_INDEX}; use crate::devices::virtio::vsock::packet::VSOCK_PKT_HDR_SIZE; use crate::devices::virtio::vsock::{ @@ -117,6 +119,7 @@ impl VsockBackend for TestBackend {} pub struct TestContext { pub cid: u64, pub mem: GuestMemoryMmap, + pub interrupt: Arc, pub mem_size: usize, pub device: Vsock, } @@ -134,6 +137,7 @@ impl TestContext { Self { cid: CID, mem, + interrupt: default_interrupt(), mem_size: MEM_SIZE, device, } @@ -196,9 +200,9 @@ pub struct EventHandlerContext<'a> { } impl EventHandlerContext<'_> { - pub fn mock_activate(&mut self, mem: GuestMemoryMmap) { + pub fn mock_activate(&mut self, mem: GuestMemoryMmap, interrupt: Arc) { // Artificially activate the device. 
- self.device.activate(mem).unwrap(); + self.device.activate(mem, interrupt).unwrap(); } pub fn signal_txq_event(&mut self) { diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 2a923637e93..7bb33411b7e 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -121,11 +121,11 @@ use std::sync::mpsc::RecvTimeoutError; use std::sync::{Arc, Barrier, Mutex}; use std::time::Duration; -use device_manager::acpi::ACPIDeviceManager; -use device_manager::resources::ResourceAllocator; +use device_manager::DeviceManager; use devices::acpi::vmgenid::VmGenIdError; use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEventSubscriber}; use seccomp::BpfProgram; +use snapshot::Persist; use userfaultfd::Uffd; use vmm_sys_util::epoll::EventSet; use vmm_sys_util::eventfd::EventFd; @@ -133,12 +133,7 @@ use vmm_sys_util::terminal::Terminal; use vstate::kvm::Kvm; use vstate::vcpu::{self, StartThreadedError, VcpuSendEventError}; -use crate::arch::DeviceType; use crate::cpu_config::templates::CpuConfiguration; -#[cfg(target_arch = "x86_64")] -use crate::device_manager::legacy::PortIODeviceManager; -use crate::device_manager::mmio::MMIODeviceManager; -use crate::devices::legacy::{IER_RDA_BIT, IER_RDA_OFFSET}; use crate::devices::virtio::balloon::{ BALLOON_DEV_ID, Balloon, BalloonConfig, BalloonError, BalloonStats, }; @@ -148,7 +143,6 @@ use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET}; use crate::logger::{METRICS, MetricsError, error, info, warn}; use crate::persist::{MicrovmState, MicrovmStateError, VmInfo}; use crate::rate_limiter::BucketUpdate; -use crate::snapshot::Persist; use crate::vmm_config::instance_info::{InstanceInfo, VmState}; use crate::vstate::memory::{GuestMemory, GuestMemoryMmap, GuestMemoryRegion}; use crate::vstate::vcpu::VcpuState; @@ -205,17 +199,15 @@ pub const HTTP_MAX_PAYLOAD_SIZE: usize = 51200; /// have permissions to open the KVM fd). #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum VmmError { - /// Failed to allocate guest resource: {0} - AllocateResources(#[from] vm_allocator::Error), #[cfg(target_arch = "aarch64")] /// Invalid command line error. Cmdline, /// Device manager error: {0} - DeviceManager(device_manager::mmio::MmioError), + DeviceManager(#[from] device_manager::DeviceManagerCreateError), + /// MMIO Device manager error: {0} + MmioDeviceManager(device_manager::mmio::MmioError), /// Error getting the KVM dirty bitmap. {0} DirtyBitmap(kvm_ioctls::Error), - /// Event fd error: {0} - EventFd(io::Error), /// I8042 error: {0} I8042Error(devices::legacy::I8042DeviceError), #[cfg(target_arch = "x86_64")] @@ -259,6 +251,8 @@ pub enum VmmError { VmmObserverTeardown(vmm_sys_util::errno::Error), /// VMGenID error: {0} VMGenID(#[from] VmGenIdError), + /// Failed perform action on device: {0} + FindDeviceError(#[from] device_manager::FindDeviceError), } /// Shorthand type for KVM dirty page bitmap. @@ -307,20 +301,15 @@ pub struct Vmm { // Guest VM core resources. kvm: Kvm, /// VM object - pub vm: Vm, + pub vm: Arc, // Save UFFD in order to keep it open in the Firecracker process, as well. + #[allow(unused)] uffd: Option, vcpus_handles: Vec, // Used by Vcpus and devices to initiate teardown; Vmm should never write here. vcpus_exit_evt: EventFd, - - // Allocator for guest resources - resource_allocator: ResourceAllocator, - // Guest VM devices. 
- mmio_device_manager: MMIODeviceManager, - #[cfg(target_arch = "x86_64")] - pio_device_manager: PortIODeviceManager, - acpi_device_manager: ACPIDeviceManager, + // Device manager + device_manager: DeviceManager, } impl Vmm { @@ -339,15 +328,6 @@ impl Vmm { self.shutdown_exit_code } - /// Gets the specified bus device. - pub fn get_bus_device( - &self, - device_type: DeviceType, - device_id: &str, - ) -> Option<&Mutex> { - self.mmio_device_manager.get_device(device_type, device_id) - } - /// Starts the microVM vcpus. /// /// # Errors @@ -378,10 +358,9 @@ impl Vmm { self.vcpus_handles.reserve(vcpu_count); for mut vcpu in vcpus.drain(..) { - vcpu.set_mmio_bus(self.mmio_device_manager.bus.clone()); + vcpu.set_mmio_bus(self.vm.common.mmio_bus.clone()); #[cfg(target_arch = "x86_64")] - vcpu.kvm_vcpu - .set_pio_bus(self.pio_device_manager.io_bus.clone()); + vcpu.kvm_vcpu.set_pio_bus(self.vm.pio_bus.clone()); self.vcpus_handles .push(vcpu.start_threaded(vcpu_seccomp_filter.clone(), barrier.clone())?); @@ -395,7 +374,7 @@ impl Vmm { /// Sends a resume command to the vCPUs. pub fn resume_vm(&mut self) -> Result<(), VmmError> { - self.mmio_device_manager.kick_devices(); + self.device_manager.kick_virtio_devices(); // Send the events. self.vcpus_handles @@ -439,60 +418,14 @@ impl Vmm { Ok(()) } - /// Sets RDA bit in serial console - pub fn emulate_serial_init(&self) -> Result<(), EmulateSerialInitError> { - // When restoring from a previously saved state, there is no serial - // driver initialization, therefore the RDA (Received Data Available) - // interrupt is not enabled. Because of that, the driver won't get - // notified of any bytes that we send to the guest. The clean solution - // would be to save the whole serial device state when we do the vm - // serialization. For now we set that bit manually - - #[cfg(target_arch = "aarch64")] - { - let serial_bus_device = self.get_bus_device(DeviceType::Serial, "Serial"); - if serial_bus_device.is_none() { - return Ok(()); - } - let mut serial_device_locked = - serial_bus_device.unwrap().lock().expect("Poisoned lock"); - let serial = serial_device_locked - .serial_mut() - .expect("Unexpected BusDeviceType"); - - serial - .serial - .write(IER_RDA_OFFSET, IER_RDA_BIT) - .map_err(|_| EmulateSerialInitError(std::io::Error::last_os_error()))?; - Ok(()) - } - - #[cfg(target_arch = "x86_64")] - { - let mut guard = self - .pio_device_manager - .stdio_serial - .lock() - .expect("Poisoned lock"); - let serial = guard.serial_mut().unwrap(); - - serial - .serial - .write(IER_RDA_OFFSET, IER_RDA_BIT) - .map_err(|_| EmulateSerialInitError(std::io::Error::last_os_error()))?; - Ok(()) - } - } - /// Injects CTRL+ALT+DEL keystroke combo in the i8042 device. #[cfg(target_arch = "x86_64")] pub fn send_ctrl_alt_del(&mut self) -> Result<(), VmmError> { - self.pio_device_manager + self.device_manager + .legacy_devices .i8042 .lock() .expect("i8042 lock was poisoned") - .i8042_device_mut() - .unwrap() .trigger_ctrl_alt_del() .map_err(VmmError::I8042Error) } @@ -514,9 +447,7 @@ impl Vmm { self.vm.save_state(&mpidrs).map_err(SaveVmState)? 
} }; - let device_states = self.mmio_device_manager.save(); - - let acpi_dev_state = self.acpi_device_manager.save(); + let device_states = self.device_manager.save(); Ok(MicrovmState { vm_info: vm_info.clone(), @@ -524,7 +455,6 @@ impl Vmm { vm_state, vcpu_states, device_states, - acpi_dev_state, }) } @@ -591,13 +521,13 @@ impl Vmm { drive_id: &str, path_on_host: String, ) -> Result<(), VmmError> { - self.mmio_device_manager + self.device_manager .with_virtio_device_with_id(TYPE_BLOCK, drive_id, |block: &mut Block| { block .update_disk_image(path_on_host) .map_err(|err| err.to_string()) }) - .map_err(VmmError::DeviceManager) + .map_err(VmmError::FindDeviceError) } /// Updates the rate limiter parameters for block device with `drive_id` id. @@ -607,22 +537,22 @@ impl Vmm { rl_bytes: BucketUpdate, rl_ops: BucketUpdate, ) -> Result<(), VmmError> { - self.mmio_device_manager + self.device_manager .with_virtio_device_with_id(TYPE_BLOCK, drive_id, |block: &mut Block| { block .update_rate_limiter(rl_bytes, rl_ops) .map_err(|err| err.to_string()) }) - .map_err(VmmError::DeviceManager) + .map_err(VmmError::FindDeviceError) } /// Updates the rate limiter parameters for block device with `drive_id` id. pub fn update_vhost_user_block_config(&mut self, drive_id: &str) -> Result<(), VmmError> { - self.mmio_device_manager + self.device_manager .with_virtio_device_with_id(TYPE_BLOCK, drive_id, |block: &mut Block| { block.update_config().map_err(|err| err.to_string()) }) - .map_err(VmmError::DeviceManager) + .map_err(VmmError::FindDeviceError) } /// Updates the rate limiter parameters for net device with `net_id` id. @@ -634,25 +564,20 @@ impl Vmm { tx_bytes: BucketUpdate, tx_ops: BucketUpdate, ) -> Result<(), VmmError> { - self.mmio_device_manager + self.device_manager .with_virtio_device_with_id(TYPE_NET, net_id, |net: &mut Net| { net.patch_rate_limiters(rx_bytes, rx_ops, tx_bytes, tx_ops); Ok(()) }) - .map_err(VmmError::DeviceManager) + .map_err(VmmError::FindDeviceError) } /// Returns a reference to the balloon device if present. pub fn balloon_config(&self) -> Result { - if let Some(busdev) = self.get_bus_device(DeviceType::Virtio(TYPE_BALLOON), BALLOON_DEV_ID) + if let Some(virtio_device) = self + .device_manager + .get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) { - let virtio_device = busdev - .lock() - .expect("Poisoned lock") - .mmio_transport_ref() - .expect("Unexpected device type") - .device(); - let config = virtio_device .lock() .expect("Poisoned lock") @@ -669,15 +594,10 @@ impl Vmm { /// Returns the latest balloon statistics if they are enabled. 
pub fn latest_balloon_stats(&self) -> Result { - if let Some(busdev) = self.get_bus_device(DeviceType::Virtio(TYPE_BALLOON), BALLOON_DEV_ID) + if let Some(virtio_device) = self + .device_manager + .get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) { - let virtio_device = busdev - .lock() - .expect("Poisoned lock") - .mmio_transport_ref() - .expect("Unexpected device type") - .device(); - let latest_stats = virtio_device .lock() .expect("Poisoned lock") @@ -702,16 +622,11 @@ impl Vmm { return Err(BalloonError::TooManyPagesRequested); } - if let Some(busdev) = self.get_bus_device(DeviceType::Virtio(TYPE_BALLOON), BALLOON_DEV_ID) + if let Some(virtio_device) = self + .device_manager + .get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) { { - let virtio_device = busdev - .lock() - .expect("Poisoned lock") - .mmio_transport_ref() - .expect("Unexpected device type") - .device(); - virtio_device .lock() .expect("Poisoned lock") @@ -732,16 +647,11 @@ impl Vmm { &mut self, stats_polling_interval_s: u16, ) -> Result<(), BalloonError> { - if let Some(busdev) = self.get_bus_device(DeviceType::Virtio(TYPE_BALLOON), BALLOON_DEV_ID) + if let Some(virtio_device) = self + .device_manager + .get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) { { - let virtio_device = busdev - .lock() - .expect("Poisoned lock") - .mmio_transport_ref() - .expect("Unexpected device type") - .device(); - virtio_device .lock() .expect("Poisoned lock") diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 1ff158d9973..b78d69fcdec 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -25,7 +25,7 @@ use crate::cpu_config::templates::StaticCpuTemplate; use crate::cpu_config::x86_64::cpuid::CpuidTrait; #[cfg(target_arch = "x86_64")] use crate::cpu_config::x86_64::cpuid::common::get_vendor_id_from_host; -use crate::device_manager::persist::{ACPIDeviceManagerState, DevicePersistError, DeviceStates}; +use crate::device_manager::{DevicePersistError, DevicesState}; use crate::logger::{info, warn}; use crate::resources::VmResources; use crate::seccomp::BpfThreadMap; @@ -69,7 +69,7 @@ impl From<&VmResources> for VmInfo { } } -/// Contains the necesary state for saving/restoring a microVM. +/// Contains the necessary state for saving/restoring a microVM. #[derive(Debug, Default, Serialize, Deserialize)] pub struct MicrovmState { /// Miscellaneous VM info. @@ -81,9 +81,7 @@ pub struct MicrovmState { /// Vcpu states. pub vcpu_states: Vec, /// Device states. - pub device_states: DeviceStates, - /// ACPI devices state. - pub acpi_dev_state: ACPIDeviceManagerState, + pub device_states: DevicesState, } /// This describes the mapping between Firecracker base virtual address and @@ -118,7 +116,7 @@ pub enum MicrovmStateError { /// Operation not allowed: {0} NotAllowed(String), /// Cannot restore devices: {0} - RestoreDevices(DevicePersistError), + RestoreDevices(#[from] DevicePersistError), /// Cannot save Vcpu state: {0} SaveVcpuState(vstate::vcpu::VcpuError), /// Cannot save Vm state: {0} @@ -168,19 +166,8 @@ pub fn create_snapshot( // We need to mark queues as dirty again for all activated devices. The reason we // do it here is that we don't mark pages as dirty during runtime // for queue objects. - // SAFETY: - // This should never fail as we only mark pages only if device has already been activated, - // and the address validation was already performed on device activation. 
- vmm.mmio_device_manager - .for_each_virtio_device(|_, _, _, dev| { - let mut d = dev.lock().unwrap(); - if d.is_activated() { - d.mark_queue_memory_dirty(vmm.vm.guest_memory()) - } else { - Ok(()) - } - }) - .unwrap(); + vmm.device_manager + .mark_virtio_queue_memory_dirty(vmm.vm.guest_memory()); Ok(()) } @@ -334,18 +321,23 @@ pub fn restore_from_snapshot( ) -> Result>, RestoreFromSnapshotError> { let mut microvm_state = snapshot_state_from_file(¶ms.snapshot_path)?; for entry in ¶ms.network_overrides { - let net_devices = &mut microvm_state.device_states.net_devices; - if let Some(device) = net_devices + microvm_state + .device_states + .mmio_state + .net_devices .iter_mut() - .find(|x| x.device_state.id == entry.iface_id) - { - device - .device_state - .tap_if_name - .clone_from(&entry.host_dev_name); - } else { - return Err(SnapshotStateFromFileError::UnknownNetworkDevice.into()); - } + .map(|device| &mut device.device_state) + .chain( + microvm_state + .device_states + .pci_state + .net_devices + .iter_mut() + .map(|device| &mut device.device_state), + ) + .find(|x| x.id == entry.iface_id) + .map(|device_state| device_state.tap_if_name.clone_from(&entry.host_dev_name)) + .ok_or(SnapshotStateFromFileError::UnknownNetworkDevice)?; } let track_dirty_pages = params.track_dirty_pages; @@ -660,14 +652,14 @@ mod tests { #[test] fn test_microvm_state_snapshot() { let vmm = default_vmm_with_devices(); - let states = vmm.mmio_device_manager.save(); + let states = vmm.device_manager.save(); // Only checking that all devices are saved, actual device state // is tested by that device's tests. - assert_eq!(states.block_devices.len(), 1); - assert_eq!(states.net_devices.len(), 1); - assert!(states.vsock_device.is_some()); - assert!(states.balloon_device.is_some()); + assert_eq!(states.mmio_state.block_devices.len(), 1); + assert_eq!(states.mmio_state.net_devices.len(), 1); + assert!(states.mmio_state.vsock_device.is_some()); + assert!(states.mmio_state.balloon_device.is_some()); let vcpu_states = vec![VcpuState::default()]; #[cfg(target_arch = "aarch64")] @@ -684,7 +676,6 @@ mod tests { vm_state: vmm.vm.save_state(&mpidrs).unwrap(), #[cfg(target_arch = "x86_64")] vm_state: vmm.vm.save_state().unwrap(), - acpi_dev_state: vmm.acpi_device_manager.save(), }; let mut buf = vec![0; 10000]; @@ -695,8 +686,8 @@ mod tests { assert_eq!(restored_microvm_state.vm_info, microvm_state.vm_info); assert_eq!( - restored_microvm_state.device_states, - microvm_state.device_states + restored_microvm_state.device_states.mmio_state, + microvm_state.device_states.mmio_state ) } diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index 00199fd1fe2..d29f76740fc 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -114,6 +114,8 @@ pub struct VmResources { pub mmds_size_limit: usize, /// Whether or not to load boot timer device. pub boot_timer: bool, + /// Whether or not to use PCIe transport for VirtIO devices. + pub pci_enabled: bool, } impl VmResources { @@ -473,7 +475,7 @@ impl VmResources { // a single way of backing guest memory for vhost-user and non-vhost-user cases, // that would not be worth the effort. 
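The network-override handling in restore_from_snapshot above now has to locate a net device in either the MMIO or the PCI half of the restored device state. A simplified, self-contained Rust sketch of that chained lookup, using a hypothetical NetState stand-in for the real per-device snapshot state, could look like this:

#[derive(Debug)]
struct NetState {
    id: String,
    tap_if_name: String,
}

/// Apply a tap-device override to whichever transport holds the interface.
fn apply_override(
    mmio_nets: &mut Vec<NetState>,
    pci_nets: &mut Vec<NetState>,
    iface_id: &str,
    host_dev_name: &str,
) -> Result<(), String> {
    mmio_nets
        .iter_mut()
        .chain(pci_nets.iter_mut())
        .find(|net| net.id == iface_id)
        .map(|net| net.tap_if_name = host_dev_name.to_string())
        .ok_or_else(|| format!("unknown network device: {iface_id}"))
}

fn main() {
    let mut mmio = vec![NetState { id: "eth0".into(), tap_if_name: "tap0".into() }];
    let mut pci = vec![NetState { id: "eth1".into(), tap_if_name: "tap1".into() }];
    apply_override(&mut mmio, &mut pci, "eth1", "tap9").unwrap();
    assert_eq!(pci[0].tap_if_name, "tap9");
}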
let regions = - crate::arch::arch_memory_regions(0, mib_to_bytes(self.machine_config.mem_size_mib)); + crate::arch::arch_memory_regions(mib_to_bytes(self.machine_config.mem_size_mib)); if vhost_user_device_used { memory::memfd_backed( regions.as_ref(), @@ -614,6 +616,7 @@ mod tests { boot_timer: false, mmds_size_limit: HTTP_MAX_PAYLOAD_SIZE, entropy: Default::default(), + pci_enabled: false, } } diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index e015152470e..d26b1ba877d 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -331,18 +331,16 @@ impl<'a> PrebootApiController<'a> { to_api: &std::sync::mpsc::Sender, api_event_fd: &vmm_sys_util::eventfd::EventFd, boot_timer_enabled: bool, + pci_enabled: bool, mmds_size_limit: usize, metadata_json: Option<&str>, ) -> Result<(VmResources, Arc>), BuildMicrovmFromRequestsError> { - let mut vm_resources = VmResources::default(); - // Silence false clippy warning. Clippy suggests using - // VmResources { boot_timer: boot_timer_enabled, ..Default::default() }; but this will - // generate build errors because VmResources contains private fields. - #[allow(clippy::field_reassign_with_default)] - { - vm_resources.mmds_size_limit = mmds_size_limit; - vm_resources.boot_timer = boot_timer_enabled; - } + let mut vm_resources = VmResources { + boot_timer: boot_timer_enabled, + mmds_size_limit, + pci_enabled, + ..Default::default() + }; // Init the data store from file, if present. if let Some(data) = metadata_json { diff --git a/src/vmm/src/test_utils/mod.rs b/src/vmm/src/test_utils/mod.rs index ae2c4a9bd3b..2cfcc274b5d 100644 --- a/src/vmm/src/test_utils/mod.rs +++ b/src/vmm/src/test_utils/mod.rs @@ -58,17 +58,18 @@ pub fn multi_region_mem_raw(regions: &[(GuestAddress, usize)]) -> Vec GuestMemoryMmap { - multi_region_mem(&crate::arch::arch_memory_regions(0, mem_size_bytes)) + multi_region_mem(&crate::arch::arch_memory_regions(mem_size_bytes)) } pub fn arch_mem_raw(mem_size_bytes: usize) -> Vec { - multi_region_mem_raw(&crate::arch::arch_memory_regions(0, mem_size_bytes)) + multi_region_mem_raw(&crate::arch::arch_memory_regions(mem_size_bytes)) } pub fn create_vmm( _kernel_image: Option<&str>, is_diff: bool, boot_microvm: bool, + pci_enabled: bool, ) -> (Arc>, EventManager) { let mut event_manager = EventManager::new().unwrap(); let empty_seccomp_filters = get_empty_filters(); @@ -82,7 +83,7 @@ pub fn create_vmm( None => boot_source_cfg.into(), }; let mock_vm_res = MockVmResources::new().with_boot_source(boot_source_cfg); - let resources: VmResources = if is_diff { + let mut resources: VmResources = if is_diff { mock_vm_res .with_vm_config(MockVmConfig::new().with_dirty_page_tracking().into()) .into() @@ -90,6 +91,8 @@ pub fn create_vmm( mock_vm_res.into() }; + resources.pci_enabled = pci_enabled; + let vmm = build_microvm_for_boot( &InstanceInfo::default(), &resources, @@ -106,15 +109,23 @@ pub fn create_vmm( } pub fn default_vmm(kernel_image: Option<&str>) -> (Arc>, EventManager) { - create_vmm(kernel_image, false, true) + create_vmm(kernel_image, false, true, false) } pub fn default_vmm_no_boot(kernel_image: Option<&str>) -> (Arc>, EventManager) { - create_vmm(kernel_image, false, false) + create_vmm(kernel_image, false, false, false) +} + +pub fn default_vmm_pci_no_boot(kernel_image: Option<&str>) -> (Arc>, EventManager) { + create_vmm(kernel_image, false, false, true) } pub fn dirty_tracking_vmm(kernel_image: Option<&str>) -> (Arc>, EventManager) { - create_vmm(kernel_image, true, true) + 
create_vmm(kernel_image, true, true, false) +} + +pub fn default_vmm_pci(kernel_image: Option<&str>) -> (Arc>, EventManager) { + create_vmm(kernel_image, false, true, false) } #[allow(clippy::undocumented_unsafe_blocks)] diff --git a/src/vmm/src/vmm_config/boot_source.rs b/src/vmm/src/vmm_config/boot_source.rs index 37ba08be449..dc21523af3c 100644 --- a/src/vmm/src/vmm_config/boot_source.rs +++ b/src/vmm/src/vmm_config/boot_source.rs @@ -9,14 +9,14 @@ use serde::{Deserialize, Serialize}; /// Default guest kernel command line: /// - `reboot=k` shut down the guest on reboot, instead of well... rebooting; /// - `panic=1` on panic, reboot after 1 second; -/// - `pci=off` do not scan for PCI devices (save boot time); /// - `nomodule` disable loadable kernel module support; /// - `8250.nr_uarts=0` disable 8250 serial interface; /// - `i8042.noaux` do not probe the i8042 controller for an attached mouse (save boot time); /// - `i8042.nomux` do not probe i8042 for a multiplexing controller (save boot time); /// - `i8042.dumbkbd` do not attempt to control kbd state via the i8042 (save boot time). -pub const DEFAULT_KERNEL_CMDLINE: &str = - "reboot=k panic=1 pci=off nomodule 8250.nr_uarts=0 i8042.noaux i8042.nomux i8042.dumbkbd"; +/// - `swiotlb=noforce` disable software bounce buffers (SWIOTLB) +pub const DEFAULT_KERNEL_CMDLINE: &str = "reboot=k panic=1 nomodule 8250.nr_uarts=0 i8042.noaux \ + i8042.nomux i8042.dumbkbd swiotlb=noforce"; /// Strongly typed data structure used to configure the boot source of the /// microvm. diff --git a/src/vmm/src/vstate/mod.rs b/src/vmm/src/vstate/mod.rs index 47458835e04..f4fa25914d0 100644 --- a/src/vmm/src/vstate/mod.rs +++ b/src/vmm/src/vstate/mod.rs @@ -5,6 +5,8 @@ pub mod kvm; /// Module with GuestMemory implementation. pub mod memory; +/// Resource manager for devices. +pub mod resources; /// Module with Vcpu implementation. pub mod vcpu; /// Module with Vm implementation. diff --git a/src/vmm/src/vstate/resources.rs b/src/vmm/src/vstate/resources.rs new file mode 100644 index 00000000000..545b211699f --- /dev/null +++ b/src/vmm/src/vstate/resources.rs @@ -0,0 +1,342 @@ +// Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
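The new vstate/resources.rs file that begins here centralizes GSI and MMIO-space allocation; its allocate_many_ids helper hands out a batch of ids atomically, returning everything to the pool on the first failure. A toy, self-contained sketch of that all-or-nothing pattern (ToyIdAllocator is a minimal stand-in for vm_allocator::IdAllocator, not the real type) is shown below:

struct ToyIdAllocator {
    next: u32,
    end: u32,
    freed: Vec<u32>,
}

impl ToyIdAllocator {
    fn new(start: u32, end: u32) -> Self {
        Self { next: start, end, freed: Vec::new() }
    }

    fn allocate_id(&mut self) -> Result<u32, &'static str> {
        if let Some(id) = self.freed.pop() {
            return Ok(id);
        }
        if self.next > self.end {
            return Err("resource not available");
        }
        let id = self.next;
        self.next += 1;
        Ok(id)
    }

    fn free_id(&mut self, id: u32) {
        self.freed.push(id);
    }
}

fn allocate_many(alloc: &mut ToyIdAllocator, count: u32) -> Result<Vec<u32>, &'static str> {
    let mut ids = Vec::with_capacity(count as usize);
    for _ in 0..count {
        match alloc.allocate_id() {
            Ok(id) => ids.push(id),
            Err(err) => {
                // Roll back the partial allocation so the caller sees no side effects.
                ids.into_iter().for_each(|id| alloc.free_id(id));
                return Err(err);
            }
        }
    }
    Ok(ids)
}

fn main() {
    let mut alloc = ToyIdAllocator::new(0, 3);
    assert_eq!(allocate_many(&mut alloc, 2), Ok(vec![0, 1]));
    // Asking for more ids than remain fails and rolls back, so the last two stay available.
    assert!(allocate_many(&mut alloc, 5).is_err());
    let mut rest = allocate_many(&mut alloc, 2).unwrap();
    rest.sort_unstable();
    assert_eq!(rest, vec![2, 3]);
}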
+// SPDX-License-Identifier: Apache-2.0 + +use std::convert::Infallible; + +use serde::{Deserialize, Serialize}; +pub use vm_allocator::AllocPolicy; +use vm_allocator::{AddressAllocator, IdAllocator}; + +use crate::arch; +use crate::snapshot::Persist; + +/// Helper function to allocate many ids from an id allocator +fn allocate_many_ids( + id_allocator: &mut IdAllocator, + count: u32, +) -> Result, vm_allocator::Error> { + let mut ids = Vec::with_capacity(count as usize); + + for _ in 0..count { + match id_allocator.allocate_id() { + Ok(id) => ids.push(id), + Err(err) => { + // It is ok to unwrap here, we just allocated the GSI + ids.into_iter().for_each(|id| { + id_allocator.free_id(id).unwrap(); + }); + return Err(err); + } + } + } + + Ok(ids) +} + +/// A resource manager for (de)allocating interrupt lines (GSIs) and guest memory +/// +/// At the moment, we support: +/// +/// * GSIs for legacy x86_64 devices +/// * GSIs for MMIO devicecs +/// * Memory allocations in the MMIO address space +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ResourceAllocator { + /// Allocator for legacy device interrupt lines + pub gsi_legacy_allocator: IdAllocator, + /// Allocator for PCI device GSIs + pub gsi_msi_allocator: IdAllocator, + /// Allocator for memory in the 32-bit MMIO address space + pub mmio32_memory: AddressAllocator, + /// Allocator for memory in the 64-bit MMIO address space + pub mmio64_memory: AddressAllocator, + /// Memory allocator for system data + pub system_memory: AddressAllocator, +} + +impl Default for ResourceAllocator { + fn default() -> Self { + ResourceAllocator::new() + } +} + +impl ResourceAllocator { + /// Create a new resource allocator for Firecracker devices + pub fn new() -> Self { + // It is fine for us to unwrap the following since we know we are passing valid ranges for + // all allocators + Self { + gsi_legacy_allocator: IdAllocator::new(arch::GSI_LEGACY_START, arch::GSI_LEGACY_END) + .unwrap(), + gsi_msi_allocator: IdAllocator::new(arch::GSI_MSI_START, arch::GSI_MSI_END).unwrap(), + mmio32_memory: AddressAllocator::new( + arch::MEM_32BIT_DEVICES_START, + arch::MEM_32BIT_DEVICES_SIZE, + ) + .unwrap(), + mmio64_memory: AddressAllocator::new( + arch::MEM_64BIT_DEVICES_START, + arch::MEM_64BIT_DEVICES_SIZE, + ) + .unwrap(), + system_memory: AddressAllocator::new(arch::SYSTEM_MEM_START, arch::SYSTEM_MEM_SIZE) + .unwrap(), + } + } + + /// Allocate a number of legacy GSIs + /// + /// # Arguments + /// + /// * `gsi_count` - The number of legacy GSIs to allocate + pub fn allocate_gsi_legacy(&mut self, gsi_count: u32) -> Result, vm_allocator::Error> { + allocate_many_ids(&mut self.gsi_legacy_allocator, gsi_count) + } + + /// Allocate a number of GSIs for MSI + /// + /// # Arguments + /// + /// * `gsi_count` - The number of GSIs to allocate + pub fn allocate_gsi_msi(&mut self, gsi_count: u32) -> Result, vm_allocator::Error> { + allocate_many_ids(&mut self.gsi_msi_allocator, gsi_count) + } + + /// Allocate a memory range in 32-bit MMIO address space + /// + /// If it succeeds, it returns the first address of the allocated range + /// + /// # Arguments + /// + /// * `size` - The size in bytes of the memory to allocate + /// * `alignment` - The alignment of the address of the first byte + /// * `policy` - A [`vm_allocator::AllocPolicy`] variant for determining the allocation policy + pub fn allocate_32bit_mmio_memory( + &mut self, + size: u64, + alignment: u64, + policy: AllocPolicy, + ) -> Result { + Ok(self + .mmio32_memory + .allocate(size, alignment, policy)? 
+ .start()) + } + + /// Allocate a memory range in 64-bit MMIO address space + /// + /// If it succeeds, it returns the first address of the allocated range + /// + /// # Arguments + /// + /// * `size` - The size in bytes of the memory to allocate + /// * `alignment` - The alignment of the address of the first byte + /// * `policy` - A [`vm_allocator::AllocPolicy`] variant for determining the allocation policy + pub fn allocate_64bit_mmio_memory( + &mut self, + size: u64, + alignment: u64, + policy: AllocPolicy, + ) -> Result { + Ok(self + .mmio64_memory + .allocate(size, alignment, policy)? + .start()) + } + + /// Allocate a memory range for system data + /// + /// If it succeeds, it returns the first address of the allocated range + /// + /// # Arguments + /// + /// * `size` - The size in bytes of the memory to allocate + /// * `alignment` - The alignment of the address of the first byte + /// * `policy` - A [`vm_allocator::AllocPolicy`] variant for determining the allocation policy + pub fn allocate_system_memory( + &mut self, + size: u64, + alignment: u64, + policy: AllocPolicy, + ) -> Result { + Ok(self + .system_memory + .allocate(size, alignment, policy)? + .start()) + } +} + +impl<'a> Persist<'a> for ResourceAllocator { + type State = ResourceAllocator; + type ConstructorArgs = (); + type Error = Infallible; + + fn save(&self) -> Self::State { + self.clone() + } + + fn restore( + _constructor_args: Self::ConstructorArgs, + state: &Self::State, + ) -> std::result::Result { + Ok(state.clone()) + } +} + +#[cfg(test)] +mod tests { + use vm_allocator::AllocPolicy; + + use super::ResourceAllocator; + use crate::arch::{self, GSI_LEGACY_NUM, GSI_LEGACY_START, GSI_MSI_NUM, GSI_MSI_START}; + use crate::snapshot::{Persist, Snapshot}; + + #[test] + fn test_allocate_irq() { + let mut allocator = ResourceAllocator::new(); + // asking for 0 IRQs should return us an empty vector + assert_eq!(allocator.allocate_gsi_legacy(0), Ok(vec![])); + // We cannot allocate more GSIs than available + assert_eq!( + allocator.allocate_gsi_legacy(GSI_LEGACY_NUM + 1), + Err(vm_allocator::Error::ResourceNotAvailable) + ); + // But allocating all of them at once should work + assert_eq!( + allocator.allocate_gsi_legacy(GSI_LEGACY_NUM), + Ok((arch::GSI_LEGACY_START..=arch::GSI_LEGACY_END).collect::>()) + ); + // And now we ran out of GSIs + assert_eq!( + allocator.allocate_gsi_legacy(1), + Err(vm_allocator::Error::ResourceNotAvailable) + ); + // But we should be able to ask for 0 GSIs + assert_eq!(allocator.allocate_gsi_legacy(0), Ok(vec![])); + + let mut allocator = ResourceAllocator::new(); + // We should be able to allocate 1 GSI + assert_eq!( + allocator.allocate_gsi_legacy(1), + Ok(vec![arch::GSI_LEGACY_START]) + ); + // We can't allocate MAX_IRQS any more + assert_eq!( + allocator.allocate_gsi_legacy(GSI_LEGACY_NUM), + Err(vm_allocator::Error::ResourceNotAvailable) + ); + // We can allocate another one and it should be the second available + assert_eq!( + allocator.allocate_gsi_legacy(1), + Ok(vec![arch::GSI_LEGACY_START + 1]) + ); + // Let's allocate the rest in a loop + for i in arch::GSI_LEGACY_START + 2..=arch::GSI_LEGACY_END { + assert_eq!(allocator.allocate_gsi_legacy(1), Ok(vec![i])); + } + } + + #[test] + fn test_allocate_gsi() { + let mut allocator = ResourceAllocator::new(); + // asking for 0 IRQs should return us an empty vector + assert_eq!(allocator.allocate_gsi_msi(0), Ok(vec![])); + // We cannot allocate more GSIs than available + assert_eq!( + allocator.allocate_gsi_msi(GSI_MSI_NUM + 1), + 
Err(vm_allocator::Error::ResourceNotAvailable) + ); + // But allocating all of them at once should work + assert_eq!( + allocator.allocate_gsi_msi(GSI_MSI_NUM), + Ok((arch::GSI_MSI_START..=arch::GSI_MSI_END).collect::>()) + ); + // And now we ran out of GSIs + assert_eq!( + allocator.allocate_gsi_msi(1), + Err(vm_allocator::Error::ResourceNotAvailable) + ); + // But we should be able to ask for 0 GSIs + assert_eq!(allocator.allocate_gsi_msi(0), Ok(vec![])); + + let mut allocator = ResourceAllocator::new(); + // We should be able to allocate 1 GSI + assert_eq!(allocator.allocate_gsi_msi(1), Ok(vec![arch::GSI_MSI_START])); + // We can't allocate MAX_IRQS any more + assert_eq!( + allocator.allocate_gsi_msi(GSI_MSI_NUM), + Err(vm_allocator::Error::ResourceNotAvailable) + ); + // We can allocate another one and it should be the second available + assert_eq!( + allocator.allocate_gsi_msi(1), + Ok(vec![arch::GSI_MSI_START + 1]) + ); + // Let's allocate the rest in a loop + for i in arch::GSI_MSI_START + 2..=arch::GSI_MSI_END { + assert_eq!(allocator.allocate_gsi_msi(1), Ok(vec![i])); + } + } + + fn clone_allocator(allocator: &ResourceAllocator) -> ResourceAllocator { + let mut buf = vec![0u8; 1024]; + Snapshot::serialize(&mut buf.as_mut_slice(), &allocator.save()).unwrap(); + let restored_state: ResourceAllocator = Snapshot::deserialize(&mut buf.as_slice()).unwrap(); + ResourceAllocator::restore((), &restored_state).unwrap() + } + + #[test] + fn test_save_restore() { + let mut allocator0 = ResourceAllocator::new(); + let irq_0 = allocator0.allocate_gsi_legacy(1).unwrap()[0]; + assert_eq!(irq_0, GSI_LEGACY_START); + let gsi_0 = allocator0.allocate_gsi_msi(1).unwrap()[0]; + assert_eq!(gsi_0, GSI_MSI_START); + + let mut allocator1 = clone_allocator(&allocator0); + let irq_1 = allocator1.allocate_gsi_legacy(1).unwrap()[0]; + assert_eq!(irq_1, GSI_LEGACY_START + 1); + let gsi_1 = allocator1.allocate_gsi_msi(1).unwrap()[0]; + assert_eq!(gsi_1, GSI_MSI_START + 1); + let mmio32_mem = allocator1 + .allocate_32bit_mmio_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(mmio32_mem, arch::MEM_32BIT_DEVICES_START); + let mmio64_mem = allocator1 + .allocate_64bit_mmio_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(mmio64_mem, arch::MEM_64BIT_DEVICES_START); + let system_mem = allocator1 + .allocate_system_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(system_mem, arch::SYSTEM_MEM_START); + + let mut allocator2 = clone_allocator(&allocator1); + allocator2 + .allocate_32bit_mmio_memory(0x42, 1, AllocPolicy::ExactMatch(mmio32_mem)) + .unwrap_err(); + allocator2 + .allocate_64bit_mmio_memory(0x42, 1, AllocPolicy::ExactMatch(mmio64_mem)) + .unwrap_err(); + allocator2 + .allocate_system_memory(0x42, 1, AllocPolicy::ExactMatch(system_mem)) + .unwrap_err(); + + let irq_2 = allocator2.allocate_gsi_legacy(1).unwrap()[0]; + assert_eq!(irq_2, GSI_LEGACY_START + 2); + let gsi_2 = allocator2.allocate_gsi_msi(1).unwrap()[0]; + assert_eq!(gsi_2, GSI_MSI_START + 2); + let mmio32_mem = allocator1 + .allocate_32bit_mmio_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(mmio32_mem, arch::MEM_32BIT_DEVICES_START + 0x42); + let mmio64_mem = allocator1 + .allocate_64bit_mmio_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(mmio64_mem, arch::MEM_64BIT_DEVICES_START + 0x42); + let system_mem = allocator1 + .allocate_system_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(system_mem, arch::SYSTEM_MEM_START + 0x42); + } +} diff 
--git a/src/vmm/src/vstate/vcpu.rs b/src/vmm/src/vstate/vcpu.rs index 8b6298079f3..642b2fd2352 100644 --- a/src/vmm/src/vstate/vcpu.rs +++ b/src/vmm/src/vstate/vcpu.rs @@ -174,7 +174,7 @@ impl Vcpu { } /// Sets a MMIO bus for this vcpu. - pub fn set_mmio_bus(&mut self, mmio_bus: crate::devices::Bus) { + pub fn set_mmio_bus(&mut self, mmio_bus: Arc) { self.kvm_vcpu.peripherals.mmio_bus = Some(mmio_bus); } @@ -481,7 +481,9 @@ fn handle_kvm_exit( VcpuExit::MmioRead(addr, data) => { if let Some(mmio_bus) = &peripherals.mmio_bus { let _metric = METRICS.vcpu.exit_mmio_read_agg.record_latency_metrics(); - mmio_bus.read(addr, data); + if let Err(err) = mmio_bus.read(addr, data) { + warn!("Invalid MMIO read @ {addr:#x}:{:#x}: {err}", data.len()); + } METRICS.vcpu.exit_mmio_read.inc(); } Ok(VcpuEmulation::Handled) @@ -489,7 +491,9 @@ fn handle_kvm_exit( VcpuExit::MmioWrite(addr, data) => { if let Some(mmio_bus) = &peripherals.mmio_bus { let _metric = METRICS.vcpu.exit_mmio_write_agg.record_latency_metrics(); - mmio_bus.write(addr, data); + if let Err(err) = mmio_bus.write(addr, data) { + warn!("Invalid MMIO read @ {addr:#x}:{:#x}: {err}", data.len()); + } METRICS.vcpu.exit_mmio_write.inc(); } Ok(VcpuEmulation::Handled) @@ -714,13 +718,12 @@ pub(crate) mod tests { use std::sync::{Arc, Barrier, Mutex}; use linux_loader::loader::KernelLoader; + use vm_device::BusDevice; use vmm_sys_util::errno; use super::*; use crate::RECV_TIMEOUT_SEC; use crate::arch::{BootProtocol, EntryPoint}; - use crate::devices::BusDevice; - use crate::devices::bus::DummyDevice; use crate::seccomp::get_empty_filters; use crate::utils::mib_to_bytes; use crate::utils::signal::validate_signal_num; @@ -730,6 +733,16 @@ pub(crate) mod tests { use crate::vstate::vm::Vm; use crate::vstate::vm::tests::setup_vm_with_memory; + struct DummyDevice; + + impl BusDevice for DummyDevice { + fn read(&mut self, _base: u64, _offset: u64, _data: &mut [u8]) {} + + fn write(&mut self, _base: u64, _offset: u64, _data: &[u8]) -> Option> { + None + } + } + #[test] fn test_handle_kvm_exit() { let (_, _, mut vcpu) = setup_vcpu(0x1000); @@ -824,8 +837,8 @@ pub(crate) mod tests { ) ); - let mut bus = crate::devices::Bus::new(); - let dummy = Arc::new(Mutex::new(BusDevice::Dummy(DummyDevice))); + let bus = Arc::new(vm_device::Bus::new()); + let dummy = Arc::new(Mutex::new(DummyDevice)); bus.insert(dummy, 0x10, 0x10).unwrap(); vcpu.set_mmio_bus(bus); let addr = 0x10; @@ -967,7 +980,7 @@ pub(crate) mod tests { fn test_set_mmio_bus() { let (_, _, mut vcpu) = setup_vcpu(0x1000); assert!(vcpu.kvm_vcpu.peripherals.mmio_bus.is_none()); - vcpu.set_mmio_bus(crate::devices::Bus::new()); + vcpu.set_mmio_bus(Arc::new(vm_device::Bus::new())); assert!(vcpu.kvm_vcpu.peripherals.mmio_bus.is_some()); } diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index f5a5755eec9..8c4049f9e0c 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -5,27 +5,236 @@ // Use of this source code is governed by a BSD-style license that can be // found in the THIRD-PARTY file. 
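The vcpu.rs changes above switch the MMIO bus over to the in-tree vm_device crate: devices implement vm_device::BusDevice, are registered on an Arc<vm_device::Bus>, and bus accesses now return a Result that the vcpu loop logs instead of silently ignoring. A rough sketch of that wiring follows; it assumes the workspace's vm-device crate, and the Option<Arc<Barrier>> return type of write is inferred from the (elided) trait signature above, so treat it as illustrative rather than definitive:

use std::sync::{Arc, Barrier, Mutex};

use vm_device::{Bus, BusDevice};

/// Tiny scratch-pad device used only for illustration.
struct Scratch([u8; 8]);

impl BusDevice for Scratch {
    fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
        let off = offset as usize;
        data.copy_from_slice(&self.0[off..off + data.len()]);
    }

    fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        let off = offset as usize;
        self.0[off..off + data.len()].copy_from_slice(data);
        None
    }
}

fn wire_up() -> Arc<Bus> {
    let bus = Arc::new(Bus::new());
    let dev = Arc::new(Mutex::new(Scratch([0; 8])));
    // Register the device at guest-physical 0x10, covering 8 bytes.
    bus.insert(dev, 0x10, 0x8).unwrap();

    let mut buf = [0u8; 4];
    // In-range accesses are dispatched to the device...
    assert!(bus.read(0x10, &mut buf).is_ok());
    // ...while out-of-range accesses now surface an error instead of being dropped.
    if let Err(err) = bus.read(0x100, &mut buf) {
        eprintln!("invalid MMIO read: {err}");
    }
    bus
}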
+use std::collections::HashMap; use std::fs::OpenOptions; use std::io::Write; use std::path::Path; -use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, Mutex, MutexGuard}; -use kvm_bindings::{KVM_MEM_LOG_DIRTY_PAGES, kvm_userspace_memory_region}; +#[cfg(target_arch = "x86_64")] +use kvm_bindings::KVM_IRQCHIP_IOAPIC; +use kvm_bindings::{ + KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI, KVM_MEM_LOG_DIRTY_PAGES, KVM_MSI_VALID_DEVID, + KvmIrqRouting, kvm_irq_routing_entry, kvm_userspace_memory_region, +}; use kvm_ioctls::VmFd; +use log::{debug, error}; +use pci::DeviceRelocation; +use serde::{Deserialize, Serialize}; +use vm_device::interrupt::{ + InterruptIndex, InterruptSourceConfig, InterruptSourceGroup, MsiIrqSourceConfig, +}; +use vmm_sys_util::errno; use vmm_sys_util::eventfd::EventFd; -use crate::arch::host_page_size; pub use crate::arch::{ArchVm as Vm, ArchVmError, VmState}; +use crate::arch::{GSI_MSI_END, host_page_size}; use crate::logger::info; use crate::persist::CreateSnapshotError; +use crate::snapshot::Persist; use crate::utils::u64_to_usize; use crate::vmm_config::snapshot::SnapshotType; use crate::vstate::memory::{ Address, GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion, GuestRegionMmap, }; +use crate::vstate::resources::ResourceAllocator; use crate::vstate::vcpu::VcpuError; use crate::{DirtyBitmap, Vcpu, mem_size_mib}; +#[derive(Debug, thiserror::Error, displaydoc::Display)] +/// Errors related with Firecracker interrupts +pub enum InterruptError { + /// Error allocating resources: {0} + Allocator(#[from] vm_allocator::Error), + /// EventFd error: {0} + EventFd(std::io::Error), + /// FamStruct error: {0} + FamStruct(#[from] vmm_sys_util::fam::Error), + /// KVM error: {0} + Kvm(#[from] kvm_ioctls::Error), +} + +#[derive(Debug, Serialize, Deserialize)] +/// A struct representing an interrupt line used by some device of the microVM +pub struct RoutingEntry { + entry: kvm_irq_routing_entry, + masked: bool, +} + +/// Type that describes an allocated interrupt +#[derive(Debug)] +pub struct MsiVector { + /// GSI used for this vector + pub gsi: u32, + /// EventFd used for this vector + pub event_fd: EventFd, + /// Flag determining whether the vector is enabled + pub enabled: AtomicBool, +} + +impl MsiVector { + /// Create a new [`MsiVector`] of a particular type + pub fn new(gsi: u32, enabled: bool) -> Result { + Ok(MsiVector { + gsi, + event_fd: EventFd::new(libc::EFD_NONBLOCK).map_err(InterruptError::EventFd)?, + enabled: AtomicBool::new(enabled), + }) + } +} + +impl MsiVector { + /// Enable vector + fn enable(&self, vmfd: &VmFd) -> Result<(), errno::Error> { + if !self.enabled.load(Ordering::Acquire) { + vmfd.register_irqfd(&self.event_fd, self.gsi)?; + self.enabled.store(true, Ordering::Release); + } + + Ok(()) + } + + /// Disable vector + fn disable(&self, vmfd: &VmFd) -> Result<(), errno::Error> { + if self.enabled.load(Ordering::Acquire) { + vmfd.unregister_irqfd(&self.event_fd, self.gsi)?; + self.enabled.store(false, Ordering::Release); + } + + Ok(()) + } +} + +#[derive(Debug)] +/// MSI interrupts created for a VirtIO device +pub struct MsiVectorGroup { + vm: Arc, + irq_routes: Vec, +} + +impl MsiVectorGroup { + /// Returns the number of vectors in this group + pub fn num_vectors(&self) -> u16 { + // It is safe to unwrap here. We are creating `MsiVectorGroup` objects through the + // `Vm::create_msix_group` where the argument for the number of `irq_routes` is a `u16`. 
+ u16::try_from(self.irq_routes.len()).unwrap() + } +} + +impl<'a> Persist<'a> for MsiVectorGroup { + type State = Vec; + type ConstructorArgs = Arc; + type Error = InterruptError; + + fn save(&self) -> Self::State { + // We don't save the "enabled" state of the MSI interrupt. PCI devices store the MSI-X + // configuration and make sure that the vector is enabled during the restore path if it was + // initially enabled + self.irq_routes.iter().map(|route| route.gsi).collect() + } + + fn restore( + constructor_args: Self::ConstructorArgs, + state: &Self::State, + ) -> std::result::Result { + let mut irq_routes = Vec::with_capacity(state.len()); + + for gsi in state { + irq_routes.push(MsiVector::new(*gsi, false)?); + } + + Ok(MsiVectorGroup { + vm: constructor_args, + irq_routes, + }) + } +} + +impl InterruptSourceGroup for MsiVectorGroup { + fn enable(&self) -> vm_device::interrupt::Result<()> { + for route in &self.irq_routes { + route.enable(&self.vm.common.fd)?; + } + + Ok(()) + } + + fn disable(&self) -> vm_device::interrupt::Result<()> { + for route in &self.irq_routes { + route.disable(&self.vm.common.fd)?; + } + + Ok(()) + } + + fn trigger(&self, index: InterruptIndex) -> vm_device::interrupt::Result<()> { + self.notifier(index) + .ok_or_else(|| { + std::io::Error::other(format!("trigger: invalid interrupt index {index}")) + })? + .write(1) + } + + fn notifier(&self, index: InterruptIndex) -> Option<&EventFd> { + self.irq_routes + .get(index as usize) + .map(|route| &route.event_fd) + } + + fn update( + &self, + index: InterruptIndex, + config: InterruptSourceConfig, + masked: bool, + set_gsi: bool, + ) -> vm_device::interrupt::Result<()> { + let msi_config = match config { + InterruptSourceConfig::LegacyIrq(_) => { + return Err(std::io::Error::other( + "MSI-x update: invalid configuration type", + )); + } + InterruptSourceConfig::MsiIrq(config) => config, + }; + + if let Some(route) = self.irq_routes.get(index as usize) { + // When an interrupt is masked the GSI will not be passed to KVM through + // KVM_SET_GSI_ROUTING. So, call [`disable()`] to unregister the interrupt file + // descriptor before passing the interrupt routes to KVM + if masked { + route.disable(&self.vm.common.fd)?; + } + + self.vm.register_msi(route, masked, msi_config)?; + if set_gsi { + self.vm + .set_gsi_routes() + .map_err(|err| std::io::Error::other(format!("MSI-X update: {err}")))? + } + + // Assign KVM_IRQFD after KVM_SET_GSI_ROUTING to avoid + // panic on kernel which does not have commit a80ced6ea514 + // (KVM: SVM: fix panic on out-of-bounds guest IRQ). + if !masked { + route.enable(&self.vm.common.fd)?; + } + + return Ok(()); + } + + Err(std::io::Error::other(format!( + "MSI-X update: invalid vector index {index}" + ))) + } + + fn set_gsi(&self) -> vm_device::interrupt::Result<()> { + self.vm + .set_gsi_routes() + .map_err(|err| std::io::Error::other(format!("MSI-X update: {err}"))) + } +} + /// Architecture independent parts of a VM. #[derive(Debug)] pub struct VmCommon { @@ -34,6 +243,12 @@ pub struct VmCommon { max_memslots: u32, /// The guest memory of this Vm. pub guest_memory: GuestMemoryMmap, + /// Interrupts used by Vm's devices + pub interrupts: Mutex>, + /// Allocator for VM resources + pub resource_allocator: Mutex, + /// MMIO bus + pub mmio_bus: Arc, } /// Errors associated with the wrappers over KVM ioctls. 
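To make the interrupt plumbing above concrete, the following is a rough usage sketch of an MsiVectorGroup as a PCI transport might drive it: allocate vectors, program one of them, flush the GSI routing to KVM, then signal the guest. It relies on an already configured irqchip and on the types introduced in this file, so it is an illustration rather than a standalone program, and the constants it programs are placeholders:

use std::sync::Arc;

use vm_device::interrupt::{InterruptSourceConfig, InterruptSourceGroup, MsiIrqSourceConfig};

use crate::vstate::vm::Vm;

fn program_and_fire(vm: Arc<Vm>) -> std::io::Result<()> {
    // Reserve two MSI-X vectors; the GSIs come from the VM's ResourceAllocator.
    let group = Vm::create_msix_group(vm, 2).expect("allocate MSI-X vectors");

    // Program vector 0 unmasked and immediately push the routing table to KVM.
    let cfg = MsiIrqSourceConfig {
        high_addr: 0x0,
        low_addr: 0xfee0_0000, // example MSI doorbell address, for illustration only
        data: 0x21,
        devid: 0x0010, // hypothetical BDF 00:02.0, packed as described further down
    };
    group.update(0, InterruptSourceConfig::MsiIrq(cfg), false, true)?;

    // Register the irqfds with KVM and inject an interrupt on vector 0.
    group.enable()?;
    group.trigger(0)
}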
@@ -59,6 +274,8 @@ pub enum VmError { VmMemory(#[from] vm_memory::Error), /// Error calling mincore: {0} Mincore(vmm_sys_util::errno::Error), + /// ResourceAllocator error: {0} + ResourceAllocator(#[from] vm_allocator::Error) } /// Contains Vm functions that are usable across CPU architectures @@ -105,6 +322,9 @@ impl Vm { fd, max_memslots: kvm.max_nr_memslots(), guest_memory: GuestMemoryMmap::default(), + interrupts: Mutex::new(HashMap::with_capacity(GSI_MSI_END as usize + 1)), + resource_allocator: Mutex::new(ResourceAllocator::new()), + mmio_bus: Arc::new(vm_device::Bus::new()), }) } @@ -189,6 +409,14 @@ impl Vm { &self.common.guest_memory } + /// Gets a mutable reference to this [`Vm`]'s [`ResourceAllocator`] object + pub fn resource_allocator(&self) -> MutexGuard { + self.common + .resource_allocator + .lock() + .expect("Poisoned lock") + } + /// Resets the KVM dirty bitmap for each of the guest's memory regions. pub fn reset_dirty_bitmap(&self) { self.guest_memory() @@ -284,6 +512,113 @@ impl Vm { file.sync_all() .map_err(|err| MemoryBackingFile("sync_all", err)) } + + /// Register a device IRQ + pub fn register_irq(&self, fd: &EventFd, gsi: u32) -> Result<(), errno::Error> { + self.common.fd.register_irqfd(fd, gsi)?; + + let mut entry = kvm_irq_routing_entry { + gsi, + type_: KVM_IRQ_ROUTING_IRQCHIP, + ..Default::default() + }; + #[cfg(target_arch = "x86_64")] + { + entry.u.irqchip.irqchip = KVM_IRQCHIP_IOAPIC; + } + #[cfg(target_arch = "aarch64")] + { + entry.u.irqchip.irqchip = 0; + } + entry.u.irqchip.pin = gsi; + + self.common + .interrupts + .lock() + .expect("Poisoned lock") + .insert( + gsi, + RoutingEntry { + entry, + masked: false, + }, + ); + Ok(()) + } + + /// Register an MSI device interrupt + pub fn register_msi( + &self, + route: &MsiVector, + masked: bool, + config: MsiIrqSourceConfig, + ) -> Result<(), errno::Error> { + let mut entry = kvm_irq_routing_entry { + gsi: route.gsi, + type_: KVM_IRQ_ROUTING_MSI, + ..Default::default() + }; + entry.u.msi.address_lo = config.low_addr; + entry.u.msi.address_hi = config.high_addr; + entry.u.msi.data = config.data; + + if self.common.fd.check_extension(kvm_ioctls::Cap::MsiDevid) { + // According to KVM documentation: + // https://docs.kernel.org/virt/kvm/api.html#kvm-set-gsi-routing + // + // if the capability is set, we need to set the flag and provide a valid unique device + // ID. "For PCI, this is usually a BDF identifier in the lower 16 bits". + // + // The layout of `config.devid` is: + // + // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --| + // | segment | bus | device | function | + // + // For the time being, we are using a single PCI segment and a single bus per segment + // so just passing config.devid should be fine. + entry.flags = KVM_MSI_VALID_DEVID; + entry.u.msi.__bindgen_anon_1.devid = config.devid; + } + + self.common + .interrupts + .lock() + .expect("Poisoned lock") + .insert(route.gsi, RoutingEntry { entry, masked }); + + Ok(()) + } + + /// Create a group of MSI-X interrupts + pub fn create_msix_group(vm: Arc, count: u16) -> Result { + debug!("Creating new MSI group with {count} vectors"); + let mut irq_routes = Vec::with_capacity(count as usize); + for gsi in vm + .resource_allocator() + .allocate_gsi_msi(count as u32)? 
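As a self-contained illustration of the devid layout documented in the comment above (a 16-bit PCI segment followed by the classic 8/5/3-bit bus/device/function split), the packing reduces to a few shifts; with a single segment and a single bus, only the device and function bits remain, so 00:02.0 becomes 0x10:

fn pci_devid(segment: u16, bus: u8, device: u8, function: u8) -> u32 {
    debug_assert!(device < 32 && function < 8);
    (u32::from(segment) << 16) | (u32::from(bus) << 8) | (u32::from(device) << 3) | u32::from(function)
}

fn main() {
    assert_eq!(pci_devid(0, 0, 2, 0), 0x10);
    assert_eq!(pci_devid(0, 1, 31, 7), 0x1ff);
}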
+ .iter() + { + irq_routes.push(MsiVector::new(*gsi, false)?); + } + + Ok(MsiVectorGroup { vm, irq_routes }) + } + + /// Set GSI routes to KVM + pub fn set_gsi_routes(&self) -> Result<(), InterruptError> { + let entries = self.common.interrupts.lock().expect("Poisoned lock"); + let mut routes = KvmIrqRouting::new(0)?; + + for entry in entries.values() { + if entry.masked { + continue; + } + routes.push(entry.entry)?; + } + + self.common.fd.set_gsi_routing(&routes)?; + Ok(()) + } } /// Use `mincore(2)` to overapproximate the dirty bitmap for the given memslot. To be used @@ -327,12 +662,29 @@ fn mincore_bitmap(region: &GuestRegionMmap) -> Result, VmError> { Ok(bitmap) } +impl DeviceRelocation for Vm { + fn move_bar( + &self, + _old_base: u64, + _new_base: u64, + _len: u64, + _pci_dev: &mut dyn pci::PciDevice, + _region_type: pci::PciBarRegionType, + ) -> Result<(), std::io::Error> { + error!("pci: device relocation not supported"); + Err(std::io::Error::from(std::io::ErrorKind::Unsupported)) + } +} + #[cfg(test)] pub(crate) mod tests { + use vm_device::interrupt::{InterruptSourceConfig, LegacyIrqSourceConfig}; use vm_memory::GuestAddress; use vm_memory::mmap::MmapRegionBuilder; use super::*; + #[cfg(target_arch = "x86_64")] + use crate::snapshot::Snapshot; use crate::test_utils::single_region_mem_raw; use crate::utils::mib_to_bytes; use crate::vstate::kvm::Kvm; @@ -439,4 +791,274 @@ pub(crate) mod tests { assert_eq!(vcpu_vec.len(), vcpu_count as usize); } + + fn enable_irqchip(vm: &mut Vm) { + #[cfg(target_arch = "x86_64")] + vm.setup_irqchip().unwrap(); + #[cfg(target_arch = "aarch64")] + vm.setup_irqchip(1).unwrap(); + } + + fn create_msix_group(vm: &Arc) -> MsiVectorGroup { + Vm::create_msix_group(vm.clone(), 4).unwrap() + } + + #[test] + fn test_msi_vector_group_new() { + let (_, vm) = setup_vm_with_memory(mib_to_bytes(128)); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + assert_eq!(msix_group.num_vectors(), 4); + } + + #[test] + fn test_msi_vector_group_enable_disable() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + + // Initially all vectors are disabled + for route in &msix_group.irq_routes { + assert!(!route.enabled.load(Ordering::Acquire)) + } + + // Enable works + msix_group.enable().unwrap(); + for route in &msix_group.irq_routes { + assert!(route.enabled.load(Ordering::Acquire)); + } + // Enabling an enabled group doesn't error out + msix_group.enable().unwrap(); + + // Disable works + msix_group.disable().unwrap(); + for route in &msix_group.irq_routes { + assert!(!route.enabled.load(Ordering::Acquire)) + } + // Disabling a disabled group doesn't error out + } + + #[test] + fn test_msi_vector_group_trigger() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + + // We can now trigger all vectors + for i in 0..4 { + msix_group.trigger(i).unwrap() + } + + // We can't trigger an invalid vector + msix_group.trigger(4).unwrap_err(); + } + + #[test] + fn test_msi_vector_group_notifier() { + let (_, vm) = setup_vm_with_memory(mib_to_bytes(128)); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + + for i in 0..4 { + assert!(msix_group.notifier(i).is_some()); + } + + assert!(msix_group.notifier(4).is_none()); + } + + #[test] + fn test_msi_vector_group_update_wrong_config() { + let (_, vm) = 
setup_vm_with_memory(mib_to_bytes(128)); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + let irq_config = LegacyIrqSourceConfig { irqchip: 0, pin: 0 }; + msix_group + .update(0, InterruptSourceConfig::LegacyIrq(irq_config), true, true) + .unwrap_err(); + } + + #[test] + fn test_msi_vector_group_update_invalid_vector() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + let config = InterruptSourceConfig::MsiIrq(MsiIrqSourceConfig { + high_addr: 0x42, + low_addr: 0x12, + data: 0x12, + devid: 0xafa, + }); + msix_group.update(0, config, true, true).unwrap(); + msix_group.update(4, config, true, true).unwrap_err(); + } + + #[test] + fn test_msi_vector_group_update() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + let vm = Arc::new(vm); + assert!(vm.common.interrupts.lock().unwrap().is_empty()); + let msix_group = create_msix_group(&vm); + + // Set some configuration for the vectors. Initially all are masked + let mut config = MsiIrqSourceConfig { + high_addr: 0x42, + low_addr: 0x13, + data: 0x12, + devid: 0xafa, + }; + for i in 0..4 { + config.data = 0x12 * i; + msix_group + .update(i, InterruptSourceConfig::MsiIrq(config), true, false) + .unwrap(); + } + + // All vectors should be disabled + for vector in &msix_group.irq_routes { + assert!(!vector.enabled.load(Ordering::Acquire)); + } + + for i in 0..4 { + let gsi = crate::arch::GSI_MSI_START + i; + let interrupts = vm.common.interrupts.lock().unwrap(); + let kvm_route = interrupts.get(&gsi).unwrap(); + assert!(kvm_route.masked); + assert_eq!(kvm_route.entry.gsi, gsi); + assert_eq!(kvm_route.entry.type_, KVM_IRQ_ROUTING_MSI); + // SAFETY: because we know we setup MSI routes. + unsafe { + assert_eq!(kvm_route.entry.u.msi.address_hi, 0x42); + assert_eq!(kvm_route.entry.u.msi.address_lo, 0x13); + assert_eq!(kvm_route.entry.u.msi.data, 0x12 * i); + } + } + + // Simply enabling the vectors should not update the registered IRQ routes + msix_group.enable().unwrap(); + for i in 0..4 { + let gsi = crate::arch::GSI_MSI_START + i; + let interrupts = vm.common.interrupts.lock().unwrap(); + let kvm_route = interrupts.get(&gsi).unwrap(); + assert!(kvm_route.masked); + assert_eq!(kvm_route.entry.gsi, gsi); + assert_eq!(kvm_route.entry.type_, KVM_IRQ_ROUTING_MSI); + // SAFETY: because we know we setup MSI routes. + unsafe { + assert_eq!(kvm_route.entry.u.msi.address_hi, 0x42); + assert_eq!(kvm_route.entry.u.msi.address_lo, 0x13); + assert_eq!(kvm_route.entry.u.msi.data, 0x12 * i); + } + } + + // Updating the config of a vector should enable its route (and only its route) + config.data = 0; + msix_group + .update(0, InterruptSourceConfig::MsiIrq(config), false, true) + .unwrap(); + for i in 0..4 { + let gsi = crate::arch::GSI_MSI_START + i; + let interrupts = vm.common.interrupts.lock().unwrap(); + let kvm_route = interrupts.get(&gsi).unwrap(); + assert_eq!(kvm_route.masked, i != 0); + assert_eq!(kvm_route.entry.gsi, gsi); + assert_eq!(kvm_route.entry.type_, KVM_IRQ_ROUTING_MSI); + // SAFETY: because we know we setup MSI routes. + unsafe { + assert_eq!(kvm_route.entry.u.msi.address_hi, 0x42); + assert_eq!(kvm_route.entry.u.msi.address_lo, 0x13); + assert_eq!(kvm_route.entry.u.msi.data, 0x12 * i); + } + } + } + + #[cfg(target_arch = "x86_64")] + #[test] + fn test_msi_vector_group_set_gsi_without_ioapic() { + // Setting GSI routes without IOAPIC setup should fail on x86. 
Apparently, it doesn't fail + // on Aarch64 + let (_, vm) = setup_vm_with_memory(mib_to_bytes(128)); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + let err = msix_group.set_gsi().unwrap_err(); + assert_eq!( + format!("{err}"), + "MSI-X update: KVM error: Invalid argument (os error 22)" + ); + } + + #[test] + fn test_msi_vector_group_set_gsi() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + + msix_group.set_gsi().unwrap(); + } + + #[test] + fn test_msi_vector_group_persistence() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + + msix_group.enable().unwrap(); + let state = msix_group.save(); + let restored_group = MsiVectorGroup::restore(vm, &state).unwrap(); + + assert_eq!(msix_group.num_vectors(), restored_group.num_vectors()); + // Even if an MSI group is enabled, we don't save it as such. During restoration, the PCI + // transport will make sure the correct config is set for the vectors and enable them + // accordingly. + for (id, vector) in msix_group.irq_routes.iter().enumerate() { + let new_vector = &restored_group.irq_routes[id]; + assert_eq!(vector.gsi, new_vector.gsi); + assert!(!new_vector.enabled.load(Ordering::Acquire)); + } + } + + #[cfg(target_arch = "x86_64")] + #[test] + fn test_restore_state_resource_allocator() { + use vm_allocator::AllocPolicy; + + let mut snapshot_data = vec![0u8; 10000]; + let (_, mut vm) = setup_vm_with_memory(0x1000); + vm.setup_irqchip().unwrap(); + + // Allocate a GSI and some memory and make sure they are still allocated after restore + let (gsi, range) = { + let mut resource_allocator = vm.resource_allocator(); + + let gsi = resource_allocator.allocate_gsi_msi(1).unwrap()[0]; + let range = resource_allocator + .allocate_32bit_mmio_memory(1024, 1024, AllocPolicy::FirstMatch) + .unwrap(); + (gsi, range) + }; + + let state = vm.save_state().unwrap(); + Snapshot::serialize(&mut snapshot_data.as_mut_slice(), &state).unwrap(); + + let restored_state: VmState = Snapshot::deserialize(&mut snapshot_data.as_slice()).unwrap(); + vm.restore_state(&restored_state).unwrap(); + + let mut resource_allocator = vm.resource_allocator(); + let gsi_new = resource_allocator.allocate_gsi_msi(1).unwrap()[0]; + assert_eq!(gsi + 1, gsi_new); + + resource_allocator + .allocate_32bit_mmio_memory(1024, 1024, AllocPolicy::ExactMatch(range)) + .unwrap_err(); + let range_new = resource_allocator + .allocate_32bit_mmio_memory(1024, 1024, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(range + 1024, range_new); + } } diff --git a/src/vmm/tests/devices.rs b/src/vmm/tests/devices.rs index 62dd4d30aa7..a1ddf124cf7 100644 --- a/src/vmm/tests/devices.rs +++ b/src/vmm/tests/devices.rs @@ -12,6 +12,7 @@ use std::sync::{Arc, Mutex}; use event_manager::{EventManager, SubscriberOps}; use libc::EFD_NONBLOCK; +use vm_device::BusDevice; use vm_superio::Serial; use vmm::devices::legacy::serial::SerialOut; use vmm::devices::legacy::{EventFdTrigger, SerialEventsWrapper, SerialWrapper}; @@ -95,7 +96,7 @@ fn test_issue_serial_hangup_anon_pipe_while_registered_stdin() { serial .lock() .unwrap() - .bus_read(data_bus_offset, &mut data[i..=i]); + .read(0x0, data_bus_offset, &mut data[i..=i]); } assert!(data[..31] == dummy_data[..31]); @@ -142,7 +143,7 @@ fn test_issue_serial_hangup_anon_pipe_while_registered_stdin() { serial .lock() .unwrap() - 
.bus_read(data_bus_offset, &mut data[i..=i]); + .read(0x0, data_bus_offset, &mut data[i..=i]); } // Process the kick stdin event generated by the reading of the 64th byte of the serial FIFO. @@ -156,7 +157,7 @@ fn test_issue_serial_hangup_anon_pipe_while_registered_stdin() { serial .lock() .unwrap() - .bus_read(data_bus_offset, &mut data[i..=i]); + .read(0x0, data_bus_offset, &mut data[i..=i]); } // We try to read again, but we detect that stdin received previously EOF. @@ -243,7 +244,7 @@ fn test_issue_serial_hangup_anon_pipe_while_unregistered_stdin() { serial .lock() .unwrap() - .bus_read(data_bus_offset, &mut data[i..=i]); + .read(0x0, data_bus_offset, &mut data[i..=i]); } assert!(data[..31] == dummy_data[..31]); @@ -293,7 +294,7 @@ fn test_issue_serial_hangup_anon_pipe_while_unregistered_stdin() { serial .lock() .unwrap() - .bus_read(data_bus_offset, &mut data[i..=i]); + .read(0x0, data_bus_offset, &mut data[i..=i]); } // Process the kick stdin event generated by the reading of the 64th byte of the serial FIFO. @@ -309,7 +310,7 @@ fn test_issue_serial_hangup_anon_pipe_while_unregistered_stdin() { serial .lock() .unwrap() - .bus_read(data_bus_offset, &mut data[i..=i]); + .read(0x0, data_bus_offset, &mut data[i..=i]); } // We try to read again, but we detect that stdin received previously EOF. diff --git a/src/vmm/tests/integration_tests.rs b/src/vmm/tests/integration_tests.rs index 7ef68468709..4dd993d7c90 100644 --- a/src/vmm/tests/integration_tests.rs +++ b/src/vmm/tests/integration_tests.rs @@ -4,6 +4,7 @@ #![allow(clippy::cast_possible_truncation, clippy::tests_outside_test_module)] use std::io::{Seek, SeekFrom}; +use std::sync::{Arc, Mutex}; use std::thread; use std::time::Duration; @@ -17,7 +18,9 @@ use vmm::rpc_interface::{ use vmm::seccomp::get_empty_filters; use vmm::snapshot::Snapshot; use vmm::test_utils::mock_resources::{MockVmResources, NOISY_KERNEL_IMAGE}; -use vmm::test_utils::{create_vmm, default_vmm, default_vmm_no_boot}; +use vmm::test_utils::{ + create_vmm, default_vmm, default_vmm_no_boot, default_vmm_pci, default_vmm_pci_no_boot, +}; use vmm::vmm_config::balloon::BalloonDeviceConfig; use vmm::vmm_config::boot_source::BootSourceConfig; use vmm::vmm_config::drive::BlockDeviceConfig; @@ -28,9 +31,24 @@ use vmm::vmm_config::snapshot::{ CreateSnapshotParams, LoadSnapshotParams, MemBackendConfig, MemBackendType, SnapshotType, }; use vmm::vmm_config::vsock::VsockDeviceConfig; -use vmm::{DumpCpuConfigError, EventManager, FcExitCode}; +use vmm::{DumpCpuConfigError, EventManager, FcExitCode, Vmm}; use vmm_sys_util::tempfile::TempFile; +#[allow(unused_mut, unused_variables)] +fn check_booted_microvm(vmm: Arc>, mut evmgr: EventManager) { + // On x86_64, the vmm should exit once its workload completes and signals the exit event. + // On aarch64, the test kernel doesn't exit, so the vmm is force-stopped. + #[cfg(target_arch = "x86_64")] + evmgr.run_with_timeout(500).unwrap(); + #[cfg(target_arch = "aarch64")] + vmm.lock().unwrap().stop(FcExitCode::Ok); + + assert_eq!( + vmm.lock().unwrap().shutdown_exit_code(), + Some(FcExitCode::Ok) + ); +} + #[test] fn test_build_and_boot_microvm() { // Error case: no boot source configured. @@ -49,25 +67,17 @@ fn test_build_and_boot_microvm() { } // Success case. - let (vmm, mut _evmgr) = default_vmm(None); + let (vmm, evmgr) = default_vmm(None); + check_booted_microvm(vmm, evmgr); - // On x86_64, the vmm should exit once its workload completes and signals the exit event. 
- // On aarch64, the test kernel doesn't exit, so the vmm is force-stopped. - #[cfg(target_arch = "x86_64")] - _evmgr.run_with_timeout(500).unwrap(); - #[cfg(target_arch = "aarch64")] - vmm.lock().unwrap().stop(FcExitCode::Ok); - - assert_eq!( - vmm.lock().unwrap().shutdown_exit_code(), - Some(FcExitCode::Ok) - ); + // microVM with PCI + let (vmm, evmgr) = default_vmm_pci(None); + check_booted_microvm(vmm, evmgr); } -#[test] -fn test_build_microvm() { +#[allow(unused_mut, unused_variables)] +fn check_build_microvm(vmm: Arc>, mut evmgr: EventManager) { // The built microVM should be in the `VmState::Paused` state here. - let (vmm, mut _evtmgr) = default_vmm_no_boot(None); assert_eq!(vmm.lock().unwrap().instance_info().state, VmState::Paused); // The microVM should be able to resume and exit successfully. @@ -75,7 +85,7 @@ fn test_build_microvm() { // On aarch64, the test kernel doesn't exit, so the vmm is force-stopped. vmm.lock().unwrap().resume_vm().unwrap(); #[cfg(target_arch = "x86_64")] - _evtmgr.run_with_timeout(500).unwrap(); + evmgr.run_with_timeout(500).unwrap(); #[cfg(target_arch = "aarch64")] vmm.lock().unwrap().stop(FcExitCode::Ok); assert_eq!( @@ -85,10 +95,14 @@ fn test_build_microvm() { } #[test] -fn test_pause_resume_microvm() { - // Tests that pausing and resuming a microVM work as expected. - let (vmm, _) = default_vmm(None); +fn test_build_microvm() { + let (vmm, evtmgr) = default_vmm_no_boot(None); + check_build_microvm(vmm, evtmgr); + let (vmm, evtmgr) = default_vmm_pci_no_boot(None); + check_build_microvm(vmm, evtmgr); +} +fn pause_resume_microvm(vmm: Arc>) { let mut api_controller = RuntimeApiController::new(VmResources::default(), vmm.clone()); // There's a race between this thread and the vcpu thread, but this thread @@ -102,6 +116,17 @@ fn test_pause_resume_microvm() { vmm.lock().unwrap().stop(FcExitCode::Ok); } +#[test] +fn test_pause_resume_microvm() { + // Tests that pausing and resuming a microVM work as expected. + let (vmm, _) = default_vmm(None); + + pause_resume_microvm(vmm); + + let (vmm, _) = default_vmm_pci(None); + pause_resume_microvm(vmm); +} + #[test] #[cfg(target_arch = "x86_64")] fn test_dirty_bitmap_success() { @@ -170,11 +195,11 @@ fn test_disallow_dump_cpu_config_without_pausing() { vmm.lock().unwrap().stop(FcExitCode::Ok); } -fn verify_create_snapshot(is_diff: bool) -> (TempFile, TempFile) { +fn verify_create_snapshot(is_diff: bool, pci_enabled: bool) -> (TempFile, TempFile) { let snapshot_file = TempFile::new().unwrap(); let memory_file = TempFile::new().unwrap(); - let (vmm, _) = create_vmm(Some(NOISY_KERNEL_IMAGE), is_diff, true); + let (vmm, _) = create_vmm(Some(NOISY_KERNEL_IMAGE), is_diff, true, pci_enabled); let resources = VmResources { machine_config: MachineConfig { mem_size_mib: 1, @@ -212,7 +237,7 @@ fn verify_create_snapshot(is_diff: bool) -> (TempFile, TempFile) { // Check that we can deserialize the microVM state from `snapshot_file`. let snapshot_path = snapshot_file.as_path().to_path_buf(); let snapshot_file_metadata = std::fs::metadata(snapshot_path).unwrap(); - let snapshot_len = snapshot_file_metadata.len() as usize; + let snapshot_len = snapshot_file_metadata.len().try_into().unwrap(); let (restored_microvm_state, _) = Snapshot::load::<_, MicrovmState>(&mut snapshot_file.as_file(), snapshot_len).unwrap(); @@ -220,9 +245,29 @@ fn verify_create_snapshot(is_diff: bool) -> (TempFile, TempFile) { // Verify deserialized data. // The default vmm has no devices and one vCPU. 
- assert_eq!(restored_microvm_state.device_states.block_devices.len(), 0); - assert_eq!(restored_microvm_state.device_states.net_devices.len(), 0); - assert!(restored_microvm_state.device_states.vsock_device.is_none()); + assert_eq!( + restored_microvm_state + .device_states + .mmio_state + .block_devices + .len(), + 0 + ); + assert_eq!( + restored_microvm_state + .device_states + .mmio_state + .net_devices + .len(), + 0 + ); + assert!( + restored_microvm_state + .device_states + .mmio_state + .vsock_device + .is_none() + ); assert_eq!(restored_microvm_state.vcpu_states.len(), 1); (snapshot_file, memory_file) @@ -261,29 +306,27 @@ fn verify_load_snapshot(snapshot_file: TempFile, memory_file: TempFile) { #[test] fn test_create_and_load_snapshot() { - // Create diff snapshot. - let (snapshot_file, memory_file) = verify_create_snapshot(true); - // Create a new microVm from snapshot. This only tests code-level logic; it verifies - // that a microVM can be built with no errors from given snapshot. - // It does _not_ verify that the guest is actually restored properly. We're using - // python integration tests for that. - verify_load_snapshot(snapshot_file, memory_file); - - // Create full snapshot. - let (snapshot_file, memory_file) = verify_create_snapshot(false); - // Create a new microVm from snapshot. This only tests code-level logic; it verifies - // that a microVM can be built with no errors from given snapshot. - // It does _not_ verify that the guest is actually restored properly. We're using - // python integration tests for that. - verify_load_snapshot(snapshot_file, memory_file); + for (diff_snap, pci_enabled) in [(false, false), (false, true), (true, false), (true, true)] { + // Create snapshot. + let (snapshot_file, memory_file) = verify_create_snapshot(diff_snap, pci_enabled); + // Create a new microVm from snapshot. This only tests code-level logic; it verifies + // that a microVM can be built with no errors from given snapshot. + // It does _not_ verify that the guest is actually restored properly. We're using + // python integration tests for that. + verify_load_snapshot(snapshot_file, memory_file); + } } #[test] fn test_snapshot_load_sanity_checks() { - use vmm::persist::SnapShotStateSanityCheckError; - - let mut microvm_state = get_microvm_state_from_snapshot(); + let microvm_state = get_microvm_state_from_snapshot(false); + check_snapshot(microvm_state); + let microvm_state = get_microvm_state_from_snapshot(true); + check_snapshot(microvm_state); +} +fn check_snapshot(mut microvm_state: MicrovmState) { + use vmm::persist::SnapShotStateSanityCheckError; snapshot_state_sanity_check(&microvm_state).unwrap(); // Remove memory regions. @@ -296,9 +339,9 @@ fn test_snapshot_load_sanity_checks() { ); } -fn get_microvm_state_from_snapshot() -> MicrovmState { +fn get_microvm_state_from_snapshot(pci_enabled: bool) -> MicrovmState { // Create a diff snapshot - let (snapshot_file, _) = verify_create_snapshot(true); + let (snapshot_file, _) = verify_create_snapshot(true, pci_enabled); // Deserialize the microVM state.
let snapshot_file_metadata = snapshot_file.as_file().metadata().unwrap(); @@ -309,7 +352,7 @@ fn get_microvm_state_from_snapshot() -> MicrovmState { } fn verify_load_snap_disallowed_after_boot_resources(res: VmmAction, res_name: &str) { - let (snapshot_file, memory_file) = verify_create_snapshot(false); + let (snapshot_file, memory_file) = verify_create_snapshot(false, false); let mut event_manager = EventManager::new().unwrap(); let empty_seccomp_filters = get_empty_filters(); diff --git a/tests/conftest.py b/tests/conftest.py index 4482a685155..96ee285d192 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -509,9 +509,21 @@ def rootfs_rw(): @pytest.fixture -def uvm_plain(microvm_factory, guest_kernel_linux_5_10, rootfs): +def uvm_plain(microvm_factory, guest_kernel_linux_5_10, rootfs, pci_enabled): """Create a vanilla VM, non-parametrized""" - return microvm_factory.build(guest_kernel_linux_5_10, rootfs) + return microvm_factory.build(guest_kernel_linux_5_10, rootfs, pci=pci_enabled) + + +@pytest.fixture +def uvm_plain_6_1(microvm_factory, guest_kernel_linux_6_1, rootfs, pci_enabled): + """Create a vanilla VM, non-parametrized""" + return microvm_factory.build(guest_kernel_linux_6_1, rootfs, pci=pci_enabled) + + +@pytest.fixture +def uvm_plain_acpi(microvm_factory, guest_kernel_acpi, rootfs, pci_enabled): + """Create a vanilla VM, non-parametrized""" + return microvm_factory.build(guest_kernel_acpi, rootfs, pci=pci_enabled) @pytest.fixture @@ -537,12 +549,12 @@ def artifact_dir(): @pytest.fixture -def uvm_plain_any(microvm_factory, guest_kernel, rootfs): +def uvm_plain_any(microvm_factory, guest_kernel, rootfs, pci_enabled): """All guest kernels kernel: all rootfs: Ubuntu 24.04 """ - return microvm_factory.build(guest_kernel, rootfs) + return microvm_factory.build(guest_kernel, rootfs, pci=pci_enabled) guest_kernel_6_1_debug = pytest.fixture( @@ -569,11 +581,23 @@ def mem_size_mib(): return 256 +@pytest.fixture(params=[True, False]) +def pci_enabled(request): + """Fixture that allows configuring whether a microVM will have PCI enabled or not""" + yield request.param + + def uvm_booted( - microvm_factory, guest_kernel, rootfs, cpu_template, vcpu_count=2, mem_size_mib=256 + microvm_factory, + guest_kernel, + rootfs, + cpu_template, + pci_enabled, + vcpu_count=2, + mem_size_mib=256, ): """Return a booted uvm""" - uvm = microvm_factory.build(guest_kernel, rootfs) + uvm = microvm_factory.build(guest_kernel, rootfs, pci=pci_enabled) uvm.spawn() uvm.basic_config(vcpu_count=vcpu_count, mem_size_mib=mem_size_mib) uvm.set_cpu_template(cpu_template) @@ -582,9 +606,13 @@ def uvm_booted( return uvm -def uvm_restored(microvm_factory, guest_kernel, rootfs, cpu_template, **kwargs): +def uvm_restored( + microvm_factory, guest_kernel, rootfs, cpu_template, pci_enabled, **kwargs +): """Return a restored uvm""" - uvm = uvm_booted(microvm_factory, guest_kernel, rootfs, cpu_template, **kwargs) + uvm = uvm_booted( + microvm_factory, guest_kernel, rootfs, cpu_template, pci_enabled, **kwargs + ) snapshot = uvm.snapshot_full() uvm.kill() uvm2 = microvm_factory.build_from_snapshot(snapshot) @@ -605,6 +633,7 @@ def uvm_any( guest_kernel, rootfs, cpu_template_any, + pci_enabled, vcpu_count, mem_size_mib, ): @@ -614,6 +643,7 @@ def uvm_any( guest_kernel, rootfs, cpu_template_any, + pci_enabled, vcpu_count=vcpu_count, mem_size_mib=mem_size_mib, ) @@ -621,7 +651,13 @@ def uvm_any( @pytest.fixture def uvm_any_booted( - microvm_factory, guest_kernel, rootfs, cpu_template_any, vcpu_count, mem_size_mib + 
microvm_factory, + guest_kernel, + rootfs, + cpu_template_any, + pci_enabled, + vcpu_count, + mem_size_mib, ): """Return booted uvms""" return uvm_booted( @@ -629,6 +665,51 @@ def uvm_any_booted( guest_kernel, rootfs, cpu_template_any, + pci_enabled, + vcpu_count=vcpu_count, + mem_size_mib=mem_size_mib, + ) + + +@pytest.fixture +def uvm_any_with_pci( + uvm_ctor, + microvm_factory, + guest_kernel_acpi, + rootfs, + cpu_template_any, + vcpu_count, + mem_size_mib, +): + """Return booted uvms with PCI enabled""" + return uvm_ctor( + microvm_factory, + guest_kernel_acpi, + rootfs, + cpu_template_any, + True, + vcpu_count=vcpu_count, + mem_size_mib=mem_size_mib, + ) + + +@pytest.fixture +def uvm_any_without_pci( + uvm_ctor, + microvm_factory, + guest_kernel, + rootfs, + cpu_template_any, + vcpu_count, + mem_size_mib, +): + """Return booted uvms with PCI disabled""" + return uvm_ctor( + microvm_factory, + guest_kernel, + rootfs, + cpu_template_any, + False, vcpu_count=vcpu_count, mem_size_mib=mem_size_mib, ) diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index 78f8d669600..3c672e82e23 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -206,6 +206,7 @@ def __init__( jailer_kwargs: Optional[dict] = None, numa_node=None, custom_cpu_template: Path = None, + pci: bool = False, ): """Set up microVM attributes, paths, and data structures.""" # pylint: disable=too-many-statements @@ -236,6 +237,10 @@ def __init__( **jailer_kwargs, ) + self.pci_enabled = pci + if pci: + self.jailer.extra_args["enable-pci"] = None + # Copy the /etc/localtime file in the jailer root self.jailer.jailed_path("/etc/localtime", subdir="etc") @@ -503,6 +508,7 @@ def dimensions(self): "rootfs": self.rootfs_file.name, "vcpus": str(self.vcpus_count), "guest_memory": f"{self.mem_size_bytes / (1024 * 1024)}MB", + "pci": f"{self.pci_enabled}", } @property @@ -800,8 +806,10 @@ def basic_config( the response is within the interval [200, 300). 
If boot_args is None, the default boot_args in Firecracker is - reboot=k panic=1 pci=off nomodule 8250.nr_uarts=0 - i8042.noaux i8042.nomux i8042.nopnp i8042.dumbkbd + reboot=k panic=1 nomodule 8250.nr_uarts=0 i8042.noaux i8042.nomux + i8042.nopnp i8042.dumbkbd swiotlb=noforce + + If PCI is disabled, Firecracker also passes pci=off to the guest Reference: file:../../src/vmm/src/vmm_config/boot_source.rs::DEFAULT_KERNEL_CMDLINE """ diff --git a/tests/framework/microvm_helpers.py b/tests/framework/microvm_helpers.py index b34da3c447e..f42b63222fb 100644 --- a/tests/framework/microvm_helpers.py +++ b/tests/framework/microvm_helpers.py @@ -127,7 +127,7 @@ def enable_console(self): raise RuntimeError(".spawn already called, too late to enable the console") if self.vm.boot_args is None: self.vm.boot_args = "" - self.vm.boot_args += "console=ttyS0 reboot=k panic=1" + self.vm.boot_args += "console=ttyS0 reboot=k panic=1 swiotlb=noforce" self.vm.jailer.daemonize = False self.vm.jailer.new_pid_ns = False diff --git a/tests/framework/vm_config.json b/tests/framework/vm_config.json index 5df673308d9..6948002e245 100644 --- a/tests/framework/vm_config.json +++ b/tests/framework/vm_config.json @@ -1,7 +1,7 @@ { "boot-source": { "kernel_image_path": "vmlinux.bin", - "boot_args": "console=ttyS0 reboot=k panic=1 pci=off", + "boot_args": "console=ttyS0 reboot=k panic=1", "initrd_path": null }, "drives": [ diff --git a/tests/framework/vm_config_cpu_template_C3.json b/tests/framework/vm_config_cpu_template_C3.json index 3b842594a18..b6dbf124022 100644 --- a/tests/framework/vm_config_cpu_template_C3.json +++ b/tests/framework/vm_config_cpu_template_C3.json @@ -1,7 +1,7 @@ { "boot-source": { "kernel_image_path": "vmlinux.bin", - "boot_args": "console=ttyS0 reboot=k panic=1 pci=off" + "boot_args": "console=ttyS0 reboot=k panic=1" }, "drives": [ { diff --git a/tests/framework/vm_config_missing_mem_size_mib.json b/tests/framework/vm_config_missing_mem_size_mib.json index 15ff19fa1b3..ea20d152473 100644 --- a/tests/framework/vm_config_missing_mem_size_mib.json +++ b/tests/framework/vm_config_missing_mem_size_mib.json @@ -1,7 +1,7 @@ { "boot-source": { "kernel_image_path": "vmlinux.bin", - "boot_args": "console=ttyS0 reboot=k panic=1 pci=off" + "boot_args": "console=ttyS0 reboot=k panic=1" }, "drives": [ { diff --git a/tests/framework/vm_config_missing_vcpu_count.json b/tests/framework/vm_config_missing_vcpu_count.json index b5aac05ddd2..719300c96fa 100644 --- a/tests/framework/vm_config_missing_vcpu_count.json +++ b/tests/framework/vm_config_missing_vcpu_count.json @@ -1,7 +1,7 @@ { "boot-source": { "kernel_image_path": "vmlinux.bin", - "boot_args": "console=ttyS0 reboot=k panic=1 pci=off" + "boot_args": "console=ttyS0 reboot=k panic=1 swiotlb=noforce" }, "drives": [ { diff --git a/tests/framework/vm_config_network.json b/tests/framework/vm_config_network.json index a081e4f6990..7e25823cd66 100644 --- a/tests/framework/vm_config_network.json +++ b/tests/framework/vm_config_network.json @@ -1,7 +1,7 @@ { "boot-source": { "kernel_image_path": "vmlinux.bin", - "boot_args": "console=ttyS0 reboot=k panic=1 pci=off", + "boot_args": "console=ttyS0 reboot=k panic=1", "initrd_path": null }, "drives": [ diff --git a/tests/framework/vm_config_smt_true.json b/tests/framework/vm_config_smt_true.json index 3a1b79a1752..383bf68519a 100644 --- a/tests/framework/vm_config_smt_true.json +++ b/tests/framework/vm_config_smt_true.json @@ -1,7 +1,7 @@ { "boot-source": { "kernel_image_path": "vmlinux.bin", - "boot_args": "console=ttyS0
reboot=k panic=1 pci=off" + "boot_args": "console=ttyS0 reboot=k panic=1" }, "drives": [ { diff --git a/tests/framework/vm_config_with_mmdsv1.json b/tests/framework/vm_config_with_mmdsv1.json index 6c30e535b1d..30f67ff5bfa 100644 --- a/tests/framework/vm_config_with_mmdsv1.json +++ b/tests/framework/vm_config_with_mmdsv1.json @@ -1,7 +1,7 @@ { "boot-source": { "kernel_image_path": "vmlinux.bin", - "boot_args": "console=ttyS0 reboot=k panic=1 pci=off", + "boot_args": "console=ttyS0 reboot=k panic=1", "initrd_path": null }, "drives": [ diff --git a/tests/framework/vm_config_with_mmdsv2.json b/tests/framework/vm_config_with_mmdsv2.json index b5855b9faa4..f766129f02f 100644 --- a/tests/framework/vm_config_with_mmdsv2.json +++ b/tests/framework/vm_config_with_mmdsv2.json @@ -1,7 +1,7 @@ { "boot-source": { "kernel_image_path": "vmlinux.bin", - "boot_args": "console=ttyS0 reboot=k panic=1 pci=off", + "boot_args": "console=ttyS0 reboot=k panic=1", "initrd_path": null }, "drives": [ diff --git a/tests/host_tools/change_net_config_space.c b/tests/host_tools/change_net_config_space.c index 7b803bdc878..592a0cfe6af 100644 --- a/tests/host_tools/change_net_config_space.c +++ b/tests/host_tools/change_net_config_space.c @@ -14,7 +14,7 @@ #include int show_usage() { - printf("Usage: ./change_net_config_space.bin [dev_addr_start] [mac_addr]\n"); + printf("Usage: ./change_net_config_space.bin [dev_addr] [mac_addr]\n"); printf("Example:\n"); printf("> ./change_net_config_space.bin 0xd00001000 0x060504030201\n"); return 0; @@ -25,18 +25,17 @@ int main(int argc, char *argv[]) { uint8_t *map_base; volatile uint8_t *virt_addr; - uint64_t mapped_size, page_size, offset_in_page, target; + uint64_t mapped_size, page_size, page_addr, offset_in_page; uint64_t width = 6; - uint64_t config_offset = 0x100; - uint64_t device_start_addr = 0x00000000; + uint64_t dev_addr = 0x00000000; uint64_t mac = 0; if (argc != 3) { return show_usage(); } - device_start_addr = strtoull(argv[1], NULL, 0); + dev_addr = strtoull(argv[1], NULL, 0); mac = strtoull(argv[2], NULL, 0); fd = open("/dev/mem", O_RDWR | O_SYNC); @@ -45,11 +44,11 @@ int main(int argc, char *argv[]) { return 1; } - target = device_start_addr + config_offset; // Get the page size. mapped_size = page_size = getpagesize(); // Get the target address physical frame page offset. - offset_in_page = (unsigned) target & (page_size - 1); + offset_in_page = (unsigned) dev_addr & (page_size - 1); + page_addr = dev_addr & ~(page_size - 1); /* If the data length goes out of the current page, * double the needed map size. 
*/ if (offset_in_page + width > page_size) { @@ -64,7 +63,8 @@ int main(int argc, char *argv[]) { PROT_READ | PROT_WRITE, MAP_SHARED, fd, - target & ~(off_t)(page_size - 1)); + page_addr + ); if (map_base == MAP_FAILED) { perror("Failed to mmap '/dev/mem'."); return 2; diff --git a/tests/host_tools/memory.py b/tests/host_tools/memory.py index 93380a9321d..d9c2a01fe06 100644 --- a/tests/host_tools/memory.py +++ b/tests/host_tools/memory.py @@ -8,6 +8,8 @@ import psutil +from framework.properties import global_props + class MemoryUsageExceededError(Exception): """A custom exception containing details on excessive memory usage.""" @@ -15,8 +17,8 @@ class MemoryUsageExceededError(Exception): def __init__(self, usage, threshold, *args): """Compose the error message containing the memory consumption.""" super().__init__( - f"Memory usage ({usage / 2**20:.2f} MiB) exceeded maximum threshold " - f"({threshold / 2**20} MiB)", + f"Memory usage ({usage / (1 << 20):.2f} MiB) exceeded maximum threshold " + f"({threshold / (1 << 20)} MiB)", *args, ) @@ -28,10 +30,20 @@ class MemoryMonitor(Thread): VMM memory usage. """ - # If guest memory is >3328MB, it is split in a 2nd region - X86_MEMORY_GAP_START = 3328 * 2**20 - - def __init__(self, vm, threshold=5 * 2**20, period_s=0.05): + # If guest memory is >3GiB, it is split in a 2nd region + # Gap starts at 3GiB and is 1GiB long + X86_32BIT_MEMORY_GAP_START = 3 << 30 + X86_32BIT_MEMORY_GAP_SIZE = 1 << 30 + # If guest memory is >255GiB, it is split in a 3rd region + # Gap starts at 256 GiB and is 256GiB long + X86_64BIT_MEMORY_GAP_START = 256 << 30 + # On ARM64 we just have a single gap, but memory starts at an offset + # Gap starts at 256 GiB and is GiB long + # Memory starts at 2GiB + ARM64_64BIT_MEMORY_GAP_START = 256 << 30 + ARM64_MEMORY_START = 2 << 30 + + def __init__(self, vm, threshold=5 << 20, period_s=0.01): """Initialize monitor attributes.""" Thread.__init__(self) self._vm = vm @@ -73,6 +85,7 @@ def run(self): for mmap in mmaps: if self.is_guest_mem(mmap.size, guest_mem_bytes): continue + mem_total += mmap.rss self._current_rss = mem_total if mem_total > self.threshold: @@ -81,24 +94,55 @@ def run(self): time.sleep(self._period_s) - def is_guest_mem(self, size, guest_mem_bytes): + def is_guest_mem_x86(self, size, guest_mem_bytes): """ - If the address is recognised as a guest memory region, - return True, otherwise return False. + Checks if a region is a guest memory region based on + x86_64 physical memory layout """ + return size in ( + # memory fits before the first gap + guest_mem_bytes, + # guest memory spans at least two regions & memory fits before the second gap + self.X86_32BIT_MEMORY_GAP_START, + # guest memory spans exactly two regions + guest_mem_bytes - self.X86_32BIT_MEMORY_GAP_START, + # guest memory fills the space between the two gaps + self.X86_64BIT_MEMORY_GAP_START + - self.X86_32BIT_MEMORY_GAP_START + - self.X86_32BIT_MEMORY_GAP_SIZE, + # guest memory spans 3 regions, this is what remains past the second gap + guest_mem_bytes + - self.X86_64BIT_MEMORY_GAP_START + + self.X86_32BIT_MEMORY_GAP_SIZE, + ) - # If x86_64 guest memory exceeds 3328M, it will be split - # in 2 regions: 3328M and the rest. We have 3 cases here - # to recognise a guest memory region: - # - its size matches the guest memory exactly - # - its size is 3328M - # - its size is guest memory minus 3328M.
+ def is_guest_mem_arch64(self, size, guest_mem_bytes): + """ + Checks if a region is a guest memory region based on + ARM64 physical memory layout + """ return size in ( + # guest memory fits before the gap guest_mem_bytes, - self.X86_MEMORY_GAP_START, - guest_mem_bytes - self.X86_MEMORY_GAP_START, + # guest memory fills the space before the gap + self.ARM64_64BIT_MEMORY_GAP_START - self.ARM64_MEMORY_START, + # guest memory spans 2 regions, this is what remains past the gap + guest_mem_bytes + - self.ARM64_64BIT_MEMORY_GAP_START + + self.ARM64_MEMORY_START, ) + def is_guest_mem(self, size, guest_mem_bytes): + """ + If the address is recognised as a guest memory region, + return True, otherwise return False. + """ + + if global_props.cpu_architecture == "x86_64": + return self.is_guest_mem_x86(size, guest_mem_bytes) + + return self.is_guest_mem_arch64(size, guest_mem_bytes) + def check_samples(self): """Check that there are no samples over the threshold.""" if self._exceeded is not None: diff --git a/tests/integration_tests/functional/test_api.py b/tests/integration_tests/functional/test_api.py index e241d4ef1c7..55bb15d5eb4 100644 --- a/tests/integration_tests/functional/test_api.py +++ b/tests/integration_tests/functional/test_api.py @@ -772,7 +772,7 @@ def test_send_ctrl_alt_del(uvm_plain_any): def _drive_patch(test_microvm, io_engine): """Exercise drive patch test scenarios.""" # Patches without mandatory fields for virtio block are not allowed. - expected_msg = "Unable to patch the block device: Device manager error: Running method expected different backend. Please verify the request arguments" + expected_msg = "Running method expected different backend." with pytest.raises(RuntimeError, match=expected_msg): test_microvm.api.drive.patch(drive_id="scratch") @@ -814,7 +814,7 @@ def _drive_patch(test_microvm, io_engine): ) # Updates to `path_on_host` with an invalid path are not allowed. - expected_msg = f"Unable to patch the block device: Device manager error: Virtio backend error: Error manipulating the backing file: No such file or directory (os error 2) {drive_path} Please verify the request arguments" + expected_msg = f"Error manipulating the backing file: No such file or directory (os error 2) {drive_path}" with pytest.raises(RuntimeError, match=re.escape(expected_msg)): test_microvm.api.drive.patch(drive_id="scratch", path_on_host=drive_path) diff --git a/tests/integration_tests/functional/test_balloon.py b/tests/integration_tests/functional/test_balloon.py index d23dc0785cb..314cd9b5afd 100644 --- a/tests/integration_tests/functional/test_balloon.py +++ b/tests/integration_tests/functional/test_balloon.py @@ -449,11 +449,11 @@ def test_stats_update(uvm_plain_any): assert next_stats["available_memory"] != final_stats["available_memory"] -def test_balloon_snapshot(microvm_factory, guest_kernel, rootfs): +def test_balloon_snapshot(uvm_plain_any, microvm_factory): """ Test that the balloon works after pause/resume. """ - vm = microvm_factory.build(guest_kernel, rootfs) + vm = uvm_plain_any vm.spawn() vm.basic_config( vcpu_count=2, @@ -531,11 +531,11 @@ def test_balloon_snapshot(microvm_factory, guest_kernel, rootfs): assert stats_after_snap["available_memory"] > latest_stats["available_memory"] -def test_memory_scrub(microvm_factory, guest_kernel, rootfs): +def test_memory_scrub(uvm_plain_any): """ Test that the memory is zeroed after deflate. 
""" - microvm = microvm_factory.build(guest_kernel, rootfs) + microvm = uvm_plain_any microvm.spawn() microvm.basic_config(vcpu_count=2, mem_size_mib=256) microvm.add_net_iface() diff --git a/tests/integration_tests/functional/test_concurrency.py b/tests/integration_tests/functional/test_concurrency.py index e4756729f2b..15394ec6ada 100644 --- a/tests/integration_tests/functional/test_concurrency.py +++ b/tests/integration_tests/functional/test_concurrency.py @@ -7,13 +7,13 @@ NO_OF_MICROVMS = 20 -def test_run_concurrency(microvm_factory, guest_kernel, rootfs): +def test_run_concurrency(microvm_factory, guest_kernel, rootfs, pci_enabled): """ Check we can spawn multiple microvms. """ def launch1(): - microvm = microvm_factory.build(guest_kernel, rootfs) + microvm = microvm_factory.build(guest_kernel, rootfs, pci=pci_enabled) microvm.time_api_requests = False # is flaky because of parallelism microvm.spawn() microvm.basic_config(vcpu_count=1, mem_size_mib=128) diff --git a/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py b/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py index 955a70bd38b..78ea0380f1b 100644 --- a/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py +++ b/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py @@ -91,7 +91,6 @@ "cqm_occup_llc", "decodeassists", "extapic", - "extd_apicid", "flushbyasid", "hw_pstate", "ibs", diff --git a/tests/integration_tests/functional/test_cpu_template_helper.py b/tests/integration_tests/functional/test_cpu_template_helper.py index e4c087fa497..74f5c96cd47 100644 --- a/tests/integration_tests/functional/test_cpu_template_helper.py +++ b/tests/integration_tests/functional/test_cpu_template_helper.py @@ -266,9 +266,7 @@ def get_guest_msrs(microvm, msr_index_list): ), ) def test_cpu_config_dump_vs_actual( - microvm_factory, - guest_kernel, - rootfs, + uvm_plain_any, cpu_template_helper, tmp_path, ): @@ -282,7 +280,7 @@ def test_cpu_config_dump_vs_actual( dump_cpu_config = build_cpu_config_dict(cpu_config_path) # Retrieve actual CPU config from guest - microvm = microvm_factory.build(guest_kernel, rootfs) + microvm = uvm_plain_any microvm.spawn() microvm.basic_config(vcpu_count=1) microvm.add_net_iface() diff --git a/tests/integration_tests/functional/test_drive_vhost_user.py b/tests/integration_tests/functional/test_drive_vhost_user.py index 79cc41b0f3a..07fcafb715e 100644 --- a/tests/integration_tests/functional/test_drive_vhost_user.py +++ b/tests/integration_tests/functional/test_drive_vhost_user.py @@ -6,11 +6,62 @@ import shutil from pathlib import Path +import pytest + import host_tools.drive as drive_tools from framework.utils_drive import partuuid_and_disk_path from host_tools.fcmetrics import FcDeviceMetrics +@pytest.fixture +def uvm_vhost_user_plain_any(microvm_factory, guest_kernel, pci_enabled): + """Builds a plain VM with no root volume""" + return microvm_factory.build( + guest_kernel, None, pci=pci_enabled, monitor_memory=False + ) + + +@pytest.fixture +def uvm_vhost_user_booted_ro(uvm_vhost_user_plain_any, rootfs): + """Returns a VM with a vhost-user rootfs""" + vm = uvm_vhost_user_plain_any + + # We need to setup ssh keys manually because we did not specify rootfs + # in microvm_factory.build method + ssh_key = rootfs.with_suffix(".id_rsa") + vm.ssh_key = ssh_key + vm.spawn() + vm.basic_config(add_root_device=False) + vm.add_vhost_user_drive("rootfs", rootfs, is_root_device=True, is_read_only=True) + vm.add_net_iface() + vm.start() + + return vm + + 
+@pytest.fixture +def uvm_vhost_user_booted_rw(uvm_vhost_user_plain_any, rootfs): + """Returns a VM with a vhost-user rootfs""" + vm = uvm_vhost_user_plain_any + + # We need to setup ssh keys manually because we did not specify rootfs + # in microvm_factory.build method + ssh_key = rootfs.with_suffix(".id_rsa") + vm.ssh_key = ssh_key + vm.spawn() + vm.basic_config(add_root_device=False) + # Create a rw rootfs file that is unique to the microVM + rootfs_rw = Path(vm.chroot()) / "rootfs" + shutil.copy(rootfs, rootfs_rw) + vm.add_vhost_user_drive( + "rootfs", rootfs_rw, is_root_device=True, is_read_only=False + ) + vm.add_net_iface() + vm.start() + + return vm + + def _check_block_size(ssh_connection, dev_path, size): """ Checks the size of the block device. @@ -34,26 +85,16 @@ def _check_drives(test_microvm, assert_dict, keys_array): assert blockdev_out_line_cols[col] == assert_dict[key] -def test_vhost_user_block(microvm_factory, guest_kernel, rootfs): +def test_vhost_user_block(uvm_vhost_user_booted_ro): """ This test simply tries to boot a VM with vhost-user-block as a root device. """ - vm = microvm_factory.build(guest_kernel, None, monitor_memory=False) - - # We need to setup ssh keys manually because we did not specify rootfs - # in microvm_factory.build method - ssh_key = rootfs.with_suffix(".id_rsa") - vm.ssh_key = ssh_key - vm.spawn() - vm.basic_config(add_root_device=False) - vm.add_vhost_user_drive("rootfs", rootfs, is_root_device=True, is_read_only=True) - vm.add_net_iface() + vm = uvm_vhost_user_booted_ro vhost_user_block_metrics = FcDeviceMetrics( "vhost_user_block", 1, aggr_supported=False ) - vm.start() # Now check that vhost-user-block with rw is last. # 1-0 means line 1, column 0. @@ -65,29 +106,14 @@ def test_vhost_user_block(microvm_factory, guest_kernel, rootfs): vhost_user_block_metrics.validate(vm) -def test_vhost_user_block_read_write(microvm_factory, guest_kernel, rootfs): +def test_vhost_user_block_read_write(uvm_vhost_user_booted_rw): """ This test simply tries to boot a VM with vhost-user-block as a root device. This test configures vhost-user-block to be read write. """ - vm = microvm_factory.build(guest_kernel, None, monitor_memory=False) - - # We need to setup ssh keys manually because we did not specify rootfs - # in microvm_factory.build method - ssh_key = rootfs.with_suffix(".id_rsa") - vm.ssh_key = ssh_key - vm.spawn() - vm.basic_config(add_root_device=False) - - # Create a rw rootfs file that is unique to the microVM - rootfs_rw = Path(vm.chroot()) / "rootfs" - shutil.copy(rootfs, rootfs_rw) - - vm.add_vhost_user_drive("rootfs", rootfs_rw, is_root_device=True) - vm.add_net_iface() - vm.start() + vm = uvm_vhost_user_booted_rw # Now check that vhost-user-block with rw is last. # 1-0 means line 1, column 0. @@ -98,22 +124,12 @@ def test_vhost_user_block_read_write(microvm_factory, guest_kernel, rootfs): _check_drives(vm, assert_dict, assert_dict.keys()) -def test_vhost_user_block_disconnect(microvm_factory, guest_kernel, rootfs): +def test_vhost_user_block_disconnect(uvm_vhost_user_booted_ro): """ Test that even if backend is killed, Firecracker is still responsive. 
""" - vm = microvm_factory.build(guest_kernel, None, monitor_memory=False) - - # We need to set up ssh keys manually because we did not specify rootfs - # in microvm_factory.build method - ssh_key = rootfs.with_suffix(".id_rsa") - vm.ssh_key = ssh_key - vm.spawn() - vm.basic_config(add_root_device=False) - vm.add_vhost_user_drive("rootfs", rootfs, is_root_device=True, is_read_only=True) - vm.add_net_iface() - vm.start() + vm = uvm_vhost_user_booted_ro # Killing the backend vm.disks_vhost_user["rootfs"].kill() @@ -123,7 +139,7 @@ def test_vhost_user_block_disconnect(microvm_factory, guest_kernel, rootfs): _config = vm.api.vm_config.get().json() -def test_device_ordering(microvm_factory, guest_kernel, rootfs): +def test_device_ordering(uvm_vhost_user_plain_any, rootfs): """ Verify device ordering. @@ -131,7 +147,7 @@ def test_device_ordering(microvm_factory, guest_kernel, rootfs): the order of the other devices should match their configuration order. """ - vm = microvm_factory.build(guest_kernel, None, monitor_memory=False) + vm = uvm_vhost_user_plain_any # We need to setup ssh keys manually because we did not specify rootfs # in microvm_factory.build method @@ -194,16 +210,12 @@ def test_device_ordering(microvm_factory, guest_kernel, rootfs): vhost_user_block_metrics.validate(vm) -def test_partuuid_boot( - microvm_factory, - guest_kernel, - rootfs, -): +def test_partuuid_boot(uvm_vhost_user_plain_any, rootfs): """ Test the output reported by blockdev when booting with PARTUUID. """ - vm = microvm_factory.build(guest_kernel, None, monitor_memory=False) + vm = uvm_vhost_user_plain_any # We need to setup ssh keys manually because we did not specify rootfs # in microvm_factory.build method @@ -230,12 +242,12 @@ def test_partuuid_boot( _check_drives(vm, assert_dict, assert_dict.keys()) -def test_partuuid_update(microvm_factory, guest_kernel, rootfs): +def test_partuuid_update(uvm_vhost_user_plain_any, rootfs): """ Test successful switching from PARTUUID boot to /dev/vda boot. """ - vm = microvm_factory.build(guest_kernel, None, monitor_memory=False) + vm = uvm_vhost_user_plain_any # We need to setup ssh keys manually because we did not specify rootfs # in microvm_factory.build method @@ -272,7 +284,7 @@ def test_partuuid_update(microvm_factory, guest_kernel, rootfs): vhost_user_block_metrics.validate(vm) -def test_config_change(microvm_factory, guest_kernel, rootfs): +def test_config_change(uvm_plain_any): """ Verify handling of block device resize. 
We expect that the guest will start reporting the updated size @@ -283,7 +295,7 @@ def test_config_change(microvm_factory, guest_kernel, rootfs): new_sizes = [20, 10, 30] # MB mkfs_mount_cmd = "mkfs.ext4 /dev/vdb && mkdir -p /tmp/tmp && mount /dev/vdb /tmp/tmp && umount /tmp/tmp" - vm = microvm_factory.build(guest_kernel, rootfs, monitor_memory=False) + vm = uvm_plain_any vm.spawn(log_level="Info") vm.basic_config() vm.add_net_iface() diff --git a/tests/integration_tests/functional/test_error_code.py b/tests/integration_tests/functional/test_error_code.py index d1a74b6f418..321c251cc93 100644 --- a/tests/integration_tests/functional/test_error_code.py +++ b/tests/integration_tests/functional/test_error_code.py @@ -25,7 +25,7 @@ def test_enosys_error_code(uvm_plain): vm.memory_monitor = None vm.basic_config( vcpu_count=1, - boot_args="reboot=k panic=1 pci=off init=/usr/local/bin/devmemread", + boot_args="reboot=k panic=1 swiotlb=noforce init=/usr/local/bin/devmemread", ) vm.start() diff --git a/tests/integration_tests/functional/test_feat_parity.py b/tests/integration_tests/functional/test_feat_parity.py index 1eadbc6d29c..9fc89ffcd2c 100644 --- a/tests/integration_tests/functional/test_feat_parity.py +++ b/tests/integration_tests/functional/test_feat_parity.py @@ -28,16 +28,11 @@ def inst_set_cpu_template_fxt(request): @pytest.fixture(name="vm") -def vm_fxt( - microvm_factory, - inst_set_cpu_template, - guest_kernel, - rootfs, -): +def vm_fxt(uvm_plain_any, inst_set_cpu_template): """ Create a VM, using the normal CPU templates """ - vm = microvm_factory.build(guest_kernel, rootfs) + vm = uvm_plain_any vm.spawn() vm.basic_config(vcpu_count=1, mem_size_mib=1024, cpu_template=inst_set_cpu_template) vm.add_net_iface() diff --git a/tests/integration_tests/functional/test_kernel_cmdline.py b/tests/integration_tests/functional/test_kernel_cmdline.py index 9707eb8a92c..e4e4c122aa9 100644 --- a/tests/integration_tests/functional/test_kernel_cmdline.py +++ b/tests/integration_tests/functional/test_kernel_cmdline.py @@ -21,8 +21,7 @@ def test_init_params(uvm_plain): # Ubuntu version from the /etc/issue file. vm.basic_config( vcpu_count=1, - boot_args="console=ttyS0 reboot=k panic=1 pci=off" - " init=/bin/cat -- /etc/issue", + boot_args="console=ttyS0 reboot=k panic=1 swiotlb=noforce init=/bin/cat -- /etc/issue", ) vm.start() diff --git a/tests/integration_tests/functional/test_max_devices.py b/tests/integration_tests/functional/test_max_devices.py index 85cf2f1399c..7cf9922c77b 100644 --- a/tests/integration_tests/functional/test_max_devices.py +++ b/tests/integration_tests/functional/test_max_devices.py @@ -6,63 +6,80 @@ import pytest -# On x86_64, IRQs are available from 5 to 23. We always use one IRQ for VMGenID -# device, so the maximum number of devices supported at the same time is 18. -# On aarch64, IRQs are available from 32 to 127. We always use one IRQ each for -# the VMGenID and RTC devices, so the maximum number of devices supported -# at the same time is 94. -MAX_DEVICES_ATTACHED = {"x86_64": 18, "aarch64": 94}.get(platform.machine()) - - -def test_attach_maximum_devices(microvm_factory, guest_kernel, rootfs): +def max_devices(uvm): + """ + Returns the maximum number of devices supported by the platform. + """ + if uvm.pci_enabled: + # On PCI, we only have one bus, so 32 minus the bus itself + return 31 + + match platform.machine(): + case "aarch64": + # On aarch64, IRQs are available from 32 to 127. 
We always use one IRQ each for + # the VMGenID and RTC devices, so the maximum number of devices supported + # at the same time is 94. + return 94 + case "x86_64": + # IRQs are available from 5 to 23. We always use one IRQ for VMGenID device, so + # the maximum number of devices supported at the same time is 18. + return 18 + case _: + raise ValueError("Unknown platform") + + +def test_attach_maximum_devices(uvm_plain_any): """ Test attaching maximum number of devices to the microVM. """ - if MAX_DEVICES_ATTACHED is None: - pytest.skip("Unsupported platform for this test.") - - test_microvm = microvm_factory.build(guest_kernel, rootfs, monitor_memory=False) + test_microvm = uvm_plain_any + test_microvm.memory_monitor = None test_microvm.spawn() # The default 256mib is not enough for 94 ssh connections on aarch64. test_microvm.basic_config(mem_size_mib=512) + max_devices_attached = max_devices(test_microvm) # Add (`MAX_DEVICES_ATTACHED` - 1) devices because the rootfs # has already been configured in the `basic_config()`function. - for _ in range(MAX_DEVICES_ATTACHED - 1): + for _ in range(max_devices_attached - 1): test_microvm.add_net_iface() test_microvm.start() # Test that network devices attached are operational. - for i in range(MAX_DEVICES_ATTACHED - 1): + for i in range(max_devices_attached - 1): # Verify if guest can run commands. test_microvm.ssh_iface(i).check_output("sync") -def test_attach_too_many_devices(microvm_factory, guest_kernel, rootfs): +def test_attach_too_many_devices(uvm_plain): """ Test attaching to a microVM more devices than available IRQs. """ - if MAX_DEVICES_ATTACHED is None: - pytest.skip("Unsupported platform for this test.") - - test_microvm = microvm_factory.build(guest_kernel, rootfs, monitor_memory=False) + test_microvm = uvm_plain + test_microvm.memory_monitor = None test_microvm.spawn() # Set up a basic microVM. test_microvm.basic_config() + max_devices_attached = max_devices(test_microvm) + # Add `MAX_DEVICES_ATTACHED` network devices on top of the # already configured rootfs. - for _ in range(MAX_DEVICES_ATTACHED): + for _ in range(max_devices_attached): test_microvm.add_net_iface() # Attempting to start a microVM with more than # `MAX_DEVICES_ATTACHED` devices should fail. error_str = ( - "Failed to allocate requested resource: The requested resource" - " is not available." + ("Could not find an available device slot on the PCI bus.") + if test_microvm.pci_enabled + else ( + "Failed to allocate requested resource: The requested resource" + " is not available." 
+ ) ) with pytest.raises(RuntimeError, match=error_str): test_microvm.start() diff --git a/tests/integration_tests/functional/test_net.py b/tests/integration_tests/functional/test_net.py index 7abf23406d5..10467affac8 100644 --- a/tests/integration_tests/functional/test_net.py +++ b/tests/integration_tests/functional/test_net.py @@ -85,9 +85,9 @@ def test_multi_queue_unsupported(uvm_plain): @pytest.fixture -def uvm_any(microvm_factory, uvm_ctor, guest_kernel, rootfs): +def uvm_any(microvm_factory, uvm_ctor, guest_kernel, rootfs, pci_enabled): """Return booted and restored uvm with no CPU templates""" - return uvm_ctor(microvm_factory, guest_kernel, rootfs, None) + return uvm_ctor(microvm_factory, guest_kernel, rootfs, None, pci_enabled) def test_tap_offload(uvm_any): diff --git a/tests/integration_tests/functional/test_net_config_space.py b/tests/integration_tests/functional/test_net_config_space.py index c4ddfea9189..d58b49b6d4a 100644 --- a/tests/integration_tests/functional/test_net_config_space.py +++ b/tests/integration_tests/functional/test_net_config_space.py @@ -2,9 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 """Tests on devices config space.""" -import platform import random -import re import string import subprocess from threading import Thread @@ -64,6 +62,8 @@ def test_net_change_mac_address(uvm_plain_any, change_net_config_space_bin): net_addr_base = _get_net_mem_addr_base(ssh_conn, guest_if1_name) assert net_addr_base is not None + config_offset = 0x4000 if test_microvm.pci_enabled else 0x100 + dev_addr = net_addr_base + config_offset # Write into '/dev/mem' the same mac address, byte by byte. # This changes the MAC address physically, in the network device registers. @@ -72,7 +72,7 @@ def test_net_change_mac_address(uvm_plain_any, change_net_config_space_bin): # `tx_spoofed_mac_count` metric shouldn't be incremented later on. rmt_path = "/tmp/change_net_config_space" test_microvm.ssh.scp_put(change_net_config_space_bin, rmt_path) - cmd = f"chmod u+x {rmt_path} && {rmt_path} {net_addr_base} {mac_hex}" + cmd = f"chmod u+x {rmt_path} && {rmt_path} {dev_addr} {mac_hex}" # This should be executed successfully. _, stdout, _ = ssh_conn.check_output(cmd) @@ -219,8 +219,7 @@ def _find_iomem_range(ssh_connection, dev_name): # its contents and grep for the VirtIO device name, which # with ACPI is "LNRO0005:XY". cmd = f"cat /proc/iomem | grep -m 1 {dev_name}" - rc, stdout, stderr = ssh_connection.run(cmd) - assert rc == 0, stderr + _, stdout, _ = ssh_connection.check_output(cmd) # Take range in the form 'start-end' from line. The line looks like this: # d00002000-d0002fff : LNRO0005:02 @@ -231,89 +230,16 @@ def _find_iomem_range(ssh_connection, dev_name): return (int(tokens[0], 16), int(tokens[1], 16)) -def _get_net_mem_addr_base_x86_acpi(ssh_connection, if_name): - """Check for net device memory start address via ACPI info""" - # On x86 we define VirtIO devices through ACPI AML bytecode. VirtIO devices - # are identified as "LNRO0005" and appear under /sys/devices/platform - sys_virtio_mmio_cmdline = "/sys/devices/platform/" - cmd = "ls {}" - _, stdout, _ = ssh_connection.check_output(cmd.format(sys_virtio_mmio_cmdline)) - virtio_devs = list(filter(lambda x: "LNRO0005" in x, stdout.strip().split())) - - # For virtio-net LNRO0005 devices, we should have a path like: - # /sys/devices/platform/LNRO0005::XY/virtioXY/net which is a directory - # that includes a subdirectory `ethZ` which represents the network device - # that corresponds to the virtio-net device. 
- cmd = "ls {}/{}/virtio{}/net" - for idx, dev in enumerate(virtio_devs): - _, guest_if_name, _ = ssh_connection.run( - cmd.format(sys_virtio_mmio_cmdline, dev, idx) - ) - if guest_if_name.strip() == if_name: - return _find_iomem_range(ssh_connection, dev)[0] - - return None - - -def _get_net_mem_addr_base_x86_cmdline(ssh_connection, if_name): - """Check for net device memory start address via command line arguments""" - sys_virtio_mmio_cmdline = "/sys/devices/virtio-mmio-cmdline/" - cmd = "ls {} | grep virtio-mmio. | sed 's/virtio-mmio.//'" - exit_code, stdout, stderr = ssh_connection.run(cmd.format(sys_virtio_mmio_cmdline)) - assert exit_code == 0, stderr - virtio_devs_idx = stdout.strip().split() - - cmd = "cat /proc/cmdline" - _, cmd_line, _ = ssh_connection.check_output(cmd) - pattern_dev = re.compile("(virtio_mmio.device=4K@0x[0-9a-f]+:[0-9]+)+") - pattern_addr = re.compile("virtio_mmio.device=4K@(0x[0-9a-f]+):[0-9]+") - devs_addr = [] - for dev in re.findall(pattern_dev, cmd_line): - matched_addr = pattern_addr.search(dev) - # The 1st group which matches this pattern - # is the device start address. `0` group is - # full match - addr = matched_addr.group(1) - devs_addr.append(addr) - - cmd = "ls {}/virtio-mmio.{}/virtio{}/net" - for idx in virtio_devs_idx: - _, guest_if_name, _ = ssh_connection.run( - cmd.format(sys_virtio_mmio_cmdline, idx, idx) - ) - if guest_if_name.strip() == if_name: - return devs_addr[int(idx)] - - return None - - def _get_net_mem_addr_base(ssh_connection, if_name): """Get the net device memory start address.""" - if platform.machine() == "x86_64": - acpi_info = _get_net_mem_addr_base_x86_acpi(ssh_connection, if_name) - if acpi_info is not None: - return acpi_info - - return _get_net_mem_addr_base_x86_cmdline(ssh_connection, if_name) - - if platform.machine() == "aarch64": - sys_virtio_mmio_cmdline = "/sys/devices/platform" - cmd = "ls {} | grep .virtio_mmio".format(sys_virtio_mmio_cmdline) - rc, stdout, _ = ssh_connection.run(cmd) - assert rc == 0 - - virtio_devs = stdout.split() - devs_addr = list(map(lambda dev: dev.split(".")[0], virtio_devs)) - - cmd = "ls {}/{}/virtio{}/net" - # Device start addresses lack the hex prefix and are not interpreted - # accordingly when parsed inside `change_config_space.c`. - hex_prefix = "0x" - for idx, dev in enumerate(virtio_devs): - _, guest_if_name, _ = ssh_connection.run( - cmd.format(sys_virtio_mmio_cmdline, dev, idx) - ) - if guest_if_name.strip() == if_name: - return hex_prefix + devs_addr[int(idx)] - - return None + _, stdout, _ = ssh_connection.check_output(f"find /sys/devices -name {if_name}") + device_paths = stdout.strip().split("\n") + assert ( + len(device_paths) == 1 + ), f"No or multiple devices found for {if_name}:\n{stdout}" + device_path = device_paths[0] + parts = device_path.split("/") + assert len(parts) >= 6, f"Unexpected device path: {device_path}" + device = parts[-4] + start_addr, _ = _find_iomem_range(ssh_connection, device) + return start_addr diff --git a/tests/integration_tests/functional/test_pci.py b/tests/integration_tests/functional/test_pci.py new file mode 100644 index 00000000000..dc0827b1aae --- /dev/null +++ b/tests/integration_tests/functional/test_pci.py @@ -0,0 +1,28 @@ +# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Tests for the PCI devices""" + + +def test_pci_root_present(uvm_any_with_pci): + """ + Test that a guest with PCI enabled has a PCI root device. 
+ """ + + vm = uvm_any_with_pci + devices = vm.ssh.run("lspci").stdout.strip().split("\n") + print(devices) + assert devices[0].startswith( + "00:00.0 Host bridge: Intel Corporation Device" + ), "PCI root not found in guest" + + +def test_pci_disabled(uvm_any_without_pci): + """ + Test that a guest with PCI disabled does not have a PCI root device but still works. + """ + + vm = uvm_any_without_pci + _, stdout, _ = vm.ssh.run("lspci") + assert ( + "00:00.0 Host bridge: Intel Corporation Device" not in stdout + ), "PCI root device found in guest although PCI is disabled" diff --git a/tests/integration_tests/functional/test_rng.py b/tests/integration_tests/functional/test_rng.py index 1893230c51a..8719472a121 100644 --- a/tests/integration_tests/functional/test_rng.py +++ b/tests/integration_tests/functional/test_rng.py @@ -8,9 +8,10 @@ from host_tools.network import SSHConnection -def uvm_with_rng_booted(microvm_factory, guest_kernel, rootfs, rate_limiter): +def uvm_with_rng_booted(uvm_plain_any, microvm_factory, rate_limiter): """Return a booted microvm with virtio-rng configured""" - uvm = microvm_factory.build(guest_kernel, rootfs) + # pylint: disable=unused-argument + uvm = uvm_plain_any uvm.spawn(log_level="INFO") uvm.basic_config(vcpu_count=2, mem_size_mib=256) uvm.add_net_iface() @@ -21,9 +22,9 @@ def uvm_with_rng_booted(microvm_factory, guest_kernel, rootfs, rate_limiter): return uvm -def uvm_with_rng_restored(microvm_factory, guest_kernel, rootfs, rate_limiter): +def uvm_with_rng_restored(uvm_plain_any, microvm_factory, rate_limiter): """Return a restored uvm with virtio-rng configured""" - uvm = uvm_with_rng_booted(microvm_factory, guest_kernel, rootfs, rate_limiter) + uvm = uvm_with_rng_booted(uvm_plain_any, microvm_factory, rate_limiter) snapshot = uvm.snapshot_full() uvm.kill() uvm2 = microvm_factory.build_from_snapshot(snapshot) @@ -44,9 +45,9 @@ def rate_limiter(request): @pytest.fixture -def uvm_any(microvm_factory, uvm_ctor, guest_kernel, rootfs, rate_limiter): +def uvm_any(microvm_factory, uvm_ctor, uvm_plain_any, rate_limiter): """Return booted and restored uvms""" - return uvm_ctor(microvm_factory, guest_kernel, rootfs, rate_limiter) + return uvm_ctor(uvm_plain_any, microvm_factory, rate_limiter) def list_rng_available(ssh_connection: SSHConnection) -> list[str]: diff --git a/tests/integration_tests/functional/test_serial_io.py b/tests/integration_tests/functional/test_serial_io.py index 7d7939a064e..353496576e4 100644 --- a/tests/integration_tests/functional/test_serial_io.py +++ b/tests/integration_tests/functional/test_serial_io.py @@ -55,7 +55,7 @@ def test_serial_after_snapshot(uvm_plain, microvm_factory): microvm.basic_config( vcpu_count=2, mem_size_mib=256, - boot_args="console=ttyS0 reboot=k panic=1 pci=off", + boot_args="console=ttyS0 reboot=k panic=1 swiotlb=noforce", ) serial = Serial(microvm) serial.open() @@ -100,7 +100,7 @@ def test_serial_console_login(uvm_plain_any): # Set up the microVM with 1 vCPU and a serial console. microvm.basic_config( - vcpu_count=1, boot_args="console=ttyS0 reboot=k panic=1 pci=off" + vcpu_count=1, boot_args="console=ttyS0 reboot=k panic=1 swiotlb=noforce" ) microvm.start() @@ -146,8 +146,9 @@ def test_serial_dos(uvm_plain_any): # Set up the microVM with 1 vCPU and a serial console. microvm.basic_config( vcpu_count=1, - boot_args="console=ttyS0 reboot=k panic=1 pci=off", + boot_args="console=ttyS0 reboot=k panic=1 swiotlb=noforce", ) + microvm.add_net_iface() microvm.start() # Open an fd for firecracker process terminal.
@@ -179,7 +180,7 @@ def test_serial_block(uvm_plain_any): test_microvm.basic_config( vcpu_count=1, mem_size_mib=512, - boot_args="console=ttyS0 reboot=k panic=1 pci=off", + boot_args="console=ttyS0 reboot=k panic=1 swiotlb=noforce", ) test_microvm.add_net_iface() test_microvm.start() diff --git a/tests/integration_tests/functional/test_snapshot_basic.py b/tests/integration_tests/functional/test_snapshot_basic.py index 2b786ea16ae..c4eac866028 100644 --- a/tests/integration_tests/functional/test_snapshot_basic.py +++ b/tests/integration_tests/functional/test_snapshot_basic.py @@ -115,9 +115,8 @@ def test_snapshot_current_version(uvm_nano): def test_cycled_snapshot_restore( bin_vsock_path, tmp_path, + uvm_plain_any, microvm_factory, - guest_kernel, - rootfs, snapshot_type, use_snapshot_editor, cpu_template_any, @@ -132,7 +131,7 @@ def test_cycled_snapshot_restore( logger = logging.getLogger("snapshot_sequence") - vm = microvm_factory.build(guest_kernel, rootfs) + vm = uvm_plain_any vm.spawn() vm.basic_config( vcpu_count=2, @@ -248,7 +247,7 @@ def test_load_snapshot_failure_handling(uvm_plain): vm.mark_killed() -def test_cmp_full_and_first_diff_mem(microvm_factory, guest_kernel, rootfs): +def test_cmp_full_and_first_diff_mem(uvm_plain_any): """ Compare memory of 2 consecutive full and diff snapshots. @@ -259,7 +258,7 @@ def test_cmp_full_and_first_diff_mem(microvm_factory, guest_kernel, rootfs): """ logger = logging.getLogger("snapshot_sequence") - vm = microvm_factory.build(guest_kernel, rootfs) + vm = uvm_plain_any vm.spawn() vm.basic_config( vcpu_count=2, @@ -414,12 +413,12 @@ def test_create_large_diff_snapshot(uvm_plain): # process would have been taken down. -def test_diff_snapshot_overlay(guest_kernel, rootfs, microvm_factory): +def test_diff_snapshot_overlay(uvm_plain_any, microvm_factory): """ Tests that if we take a diff snapshot and direct firecracker to write it on top of an existing snapshot file, it will successfully merge them. """ - basevm = microvm_factory.build(guest_kernel, rootfs) + basevm = uvm_plain_any basevm.spawn() basevm.basic_config(track_dirty_pages=True) basevm.add_net_iface() @@ -451,7 +450,7 @@ def test_diff_snapshot_overlay(guest_kernel, rootfs, microvm_factory): # Check that the restored VM works -def test_snapshot_overwrite_self(guest_kernel, rootfs, microvm_factory): +def test_snapshot_overwrite_self(uvm_plain_any, microvm_factory): """Tests that if we try to take a snapshot that would overwrite the very file from which the current VM is stored, nothing happens. @@ -459,7 +458,7 @@ def test_snapshot_overwrite_self(guest_kernel, rootfs, microvm_factory): of mmap does not specify what should happen if the file is changed after being mmap'd (https://man7.org/linux/man-pages/man2/mmap.2.html). 
     It seems that these changes can propagate to the mmap'd memory region."""
-    base_vm = microvm_factory.build(guest_kernel, rootfs)
+    base_vm = uvm_plain_any
     base_vm.spawn()
     base_vm.basic_config()
     base_vm.add_net_iface()
@@ -483,11 +482,11 @@ def test_snapshot_overwrite_self(guest_kernel, rootfs, microvm_factory):
     # restored, with a new snapshot of this vm, does not break the VM


-def test_vmgenid(guest_kernel_linux_6_1, rootfs, microvm_factory, snapshot_type):
+def test_vmgenid(uvm_plain_6_1, microvm_factory, snapshot_type):
     """
     Test VMGenID device upon snapshot resume
     """
-    base_vm = microvm_factory.build(guest_kernel_linux_6_1, rootfs)
+    base_vm = uvm_plain_6_1
     base_vm.spawn()
     base_vm.basic_config(track_dirty_pages=True)
     base_vm.add_net_iface()
diff --git a/tests/integration_tests/functional/test_vsock.py b/tests/integration_tests/functional/test_vsock.py
index dfa02510b37..8c0d30700c6 100644
--- a/tests/integration_tests/functional/test_vsock.py
+++ b/tests/integration_tests/functional/test_vsock.py
@@ -102,11 +102,11 @@ def negative_test_host_connections(vm, blob_path, blob_hash):
     validate_fc_metrics(metrics)


-def test_vsock_epipe(uvm_plain, bin_vsock_path, test_fc_session_root_path):
+def test_vsock_epipe(uvm_plain_any, bin_vsock_path, test_fc_session_root_path):
     """
     Vsock negative test to validate SIGPIPE/EPIPE handling.
     """
-    vm = uvm_plain
+    vm = uvm_plain_any
     vm.spawn()
     vm.basic_config()
     vm.add_net_iface()
@@ -129,7 +129,7 @@ def test_vsock_epipe(uvm_plain, bin_vsock_path, test_fc_session_root_path):


 def test_vsock_transport_reset_h2g(
-    uvm_nano, microvm_factory, bin_vsock_path, test_fc_session_root_path
+    uvm_plain_any, microvm_factory, bin_vsock_path, test_fc_session_root_path
 ):
     """
     Vsock transport reset test.
@@ -146,7 +146,9 @@ def test_vsock_transport_reset_h2g(
     6. Close VM -> Load VM from Snapshot -> check that vsock device is still working.
     """
-    test_vm = uvm_nano
+    test_vm = uvm_plain_any
+    test_vm.spawn()
+    test_vm.basic_config(vcpu_count=2, mem_size_mib=256)
     test_vm.add_net_iface()
     test_vm.api.vsock.put(vsock_id="vsock0", guest_cid=3, uds_path=f"/{VSOCK_UDS_PATH}")
     test_vm.start()
@@ -213,11 +215,13 @@ def test_vsock_transport_reset_h2g(
     validate_fc_metrics(metrics)


-def test_vsock_transport_reset_g2h(uvm_nano, microvm_factory):
+def test_vsock_transport_reset_g2h(uvm_plain_any, microvm_factory):
     """
     Vsock transport reset test.
     """
-    test_vm = uvm_nano
+    test_vm = uvm_plain_any
+    test_vm.spawn()
+    test_vm.basic_config(vcpu_count=2, mem_size_mib=256)
     test_vm.add_net_iface()
     test_vm.api.vsock.put(vsock_id="vsock0", guest_cid=3, uds_path=f"/{VSOCK_UDS_PATH}")
     test_vm.start()
diff --git a/tests/integration_tests/performance/test_block.py b/tests/integration_tests/performance/test_block.py
index dfd0728084a..8882ee0717c 100644
--- a/tests/integration_tests/performance/test_block.py
+++ b/tests/integration_tests/performance/test_block.py
@@ -161,9 +161,7 @@ def emit_fio_metrics(logs_dir, metrics):
 @pytest.mark.parametrize("fio_block_size", [4096], ids=["bs4096"])
 @pytest.mark.parametrize("fio_engine", ["libaio", "psync"])
 def test_block_performance(
-    microvm_factory,
-    guest_kernel_acpi,
-    rootfs,
+    uvm_plain_acpi,
     vcpus,
     fio_mode,
     fio_block_size,
@@ -175,7 +173,7 @@ def test_block_performance(
     """
     Execute block device emulation benchmarking scenarios.
     """
-    vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False)
+    vm = uvm_plain_acpi
     vm.spawn(log_level="Info", emit_metrics=True)
     vm.basic_config(vcpu_count=vcpus, mem_size_mib=GUEST_MEM_MIB)
     vm.add_net_iface()
@@ -213,9 +211,7 @@ def test_block_performance(
 @pytest.mark.parametrize("fio_mode", ["randread"])
 @pytest.mark.parametrize("fio_block_size", [4096], ids=["bs4096"])
 def test_block_vhost_user_performance(
-    microvm_factory,
-    guest_kernel_acpi,
-    rootfs,
+    uvm_plain_acpi,
     vcpus,
     fio_mode,
     fio_block_size,
@@ -226,7 +222,7 @@ def test_block_vhost_user_performance(
     Execute block device emulation benchmarking scenarios.

     """
-    vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False)
+    vm = uvm_plain_acpi
     vm.spawn(log_level="Info", emit_metrics=True)
     vm.basic_config(vcpu_count=vcpus, mem_size_mib=GUEST_MEM_MIB)
     vm.add_net_iface()
diff --git a/tests/integration_tests/performance/test_boottime.py b/tests/integration_tests/performance/test_boottime.py
index 77812738b17..d80bf026a39 100644
--- a/tests/integration_tests/performance/test_boottime.py
+++ b/tests/integration_tests/performance/test_boottime.py
@@ -11,8 +11,8 @@
 # Regex for obtaining boot time from some string.

 DEFAULT_BOOT_ARGS = (
-    "reboot=k panic=1 pci=off nomodule 8250.nr_uarts=0"
-    " i8042.noaux i8042.nomux i8042.nopnp i8042.dumbkbd"
+    "reboot=k panic=1 nomodule 8250.nr_uarts=0"
+    " i8042.noaux i8042.nomux i8042.nopnp i8042.dumbkbd swiotlb=noforce"
 )

@@ -94,31 +94,60 @@ def to_ms(v, unit):
     return kernel, userspace, total


+def launch_vm_with_boot_timer(
+    microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib, pci_enabled
+):
+    """Launches a microVM with guest-timer and returns the reported metrics for it"""
+    vm = microvm_factory.build(guest_kernel_acpi, rootfs_rw, pci=pci_enabled)
+    vm.jailer.extra_args.update({"boot-timer": None})
+    vm.spawn()
+    vm.basic_config(
+        vcpu_count=vcpu_count,
+        mem_size_mib=mem_size_mib,
+        boot_args=DEFAULT_BOOT_ARGS + " init=/usr/local/bin/init",
+        enable_entropy_device=True,
+    )
+    vm.add_net_iface()
+    vm.start()
+    vm.pin_threads(0)
+
+    boot_time_us, cpu_boot_time_us = get_boottime_device_info(vm)
+
+    return (vm, boot_time_us, cpu_boot_time_us)
+
+
+def test_boot_timer(microvm_factory, guest_kernel_acpi, rootfs, pci_enabled):
+    """Tests that the boot timer device works"""
+    launch_vm_with_boot_timer(
+        microvm_factory, guest_kernel_acpi, rootfs, 1, 128, pci_enabled
+    )
+
+
 @pytest.mark.parametrize(
     "vcpu_count,mem_size_mib",
     [(1, 128), (1, 1024), (2, 2048), (4, 4096)],
 )
 @pytest.mark.nonci
 def test_boottime(
-    microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib, metrics
+    microvm_factory,
+    guest_kernel_acpi,
+    rootfs_rw,
+    vcpu_count,
+    mem_size_mib,
+    pci_enabled,
+    metrics,
 ):
     """Test boot time with different guest configurations"""
     for i in range(10):
-        vm = microvm_factory.build(guest_kernel_acpi, rootfs_rw)
-        vm.jailer.extra_args.update({"boot-timer": None})
-        vm.spawn()
-        vm.basic_config(
-            vcpu_count=vcpu_count,
-            mem_size_mib=mem_size_mib,
-            boot_args=DEFAULT_BOOT_ARGS + " init=/usr/local/bin/init",
-            enable_entropy_device=True,
+        vm, boot_time_us, cpu_boot_time_us = launch_vm_with_boot_timer(
+            microvm_factory,
+            guest_kernel_acpi,
+            rootfs_rw,
+            vcpu_count,
+            mem_size_mib,
+            pci_enabled,
         )
-        vm.add_net_iface()
-        vm.start()
-        vm.pin_threads(0)
-
-        boot_time_us, cpu_boot_time_us = get_boottime_device_info(vm)

         if i == 0:
             metrics.set_dimensions(
diff --git a/tests/integration_tests/performance/test_huge_pages.py b/tests/integration_tests/performance/test_huge_pages.py
index 04a9264977a..1c5a14873d1 100644
--- a/tests/integration_tests/performance/test_huge_pages.py
+++ b/tests/integration_tests/performance/test_huge_pages.py
@@ -68,9 +68,7 @@ def test_hugetlbfs_boot(uvm_plain):
     )


-def test_hugetlbfs_snapshot(
-    microvm_factory, guest_kernel_linux_5_10, rootfs, snapshot_type
-):
+def test_hugetlbfs_snapshot(microvm_factory, uvm_plain, snapshot_type):
     """
     Test hugetlbfs snapshot restore via uffd
@@ -79,7 +77,7 @@ def test_hugetlbfs_snapshot(
     """
     ### Create Snapshot ###
-    vm = microvm_factory.build(guest_kernel_linux_5_10, rootfs)
+    vm = uvm_plain
     vm.memory_monitor = None
     vm.spawn()
     vm.basic_config(
@@ -107,8 +105,7 @@ def test_hugetlbfs_snapshot(
 @pytest.mark.parametrize("huge_pages", HugePagesConfig)
 def test_ept_violation_count(
     microvm_factory,
-    guest_kernel_linux_5_10,
-    rootfs,
+    uvm_plain,
     metrics,
     huge_pages,
 ):
@@ -118,7 +115,7 @@ def test_ept_violation_count(
     """
     ### Create Snapshot ###
-    vm = microvm_factory.build(guest_kernel_linux_5_10, rootfs)
+    vm = uvm_plain
     vm.memory_monitor = None
     vm.spawn()
     vm.basic_config(huge_pages=huge_pages, mem_size_mib=256)
diff --git a/tests/integration_tests/performance/test_initrd.py b/tests/integration_tests/performance/test_initrd.py
index 3845e5610c0..7b92644efa6 100644
--- a/tests/integration_tests/performance/test_initrd.py
+++ b/tests/integration_tests/performance/test_initrd.py
@@ -9,13 +9,15 @@


 @pytest.fixture
-def uvm_with_initrd(microvm_factory, guest_kernel, record_property, artifact_dir):
+def uvm_with_initrd(
+    microvm_factory, guest_kernel, pci_enabled, record_property, artifact_dir
+):
     """
     See file:../docs/initrd.md
     """
     fs = artifact_dir / "initramfs.cpio"
     record_property("rootfs", fs.name)
-    uvm = microvm_factory.build(guest_kernel)
+    uvm = microvm_factory.build(guest_kernel, pci=pci_enabled)
     uvm.initrd_file = fs
     yield uvm

@@ -33,7 +35,7 @@ def test_microvm_initrd_with_serial(uvm_with_initrd, huge_pages):
     vm.basic_config(
         add_root_device=False,
         vcpu_count=1,
-        boot_args="console=ttyS0 reboot=k panic=1 pci=off",
+        boot_args="console=ttyS0 reboot=k panic=1 swiotlb=noforce",
         use_initrd=True,
         huge_pages=huge_pages,
     )
diff --git a/tests/integration_tests/performance/test_memory_overhead.py b/tests/integration_tests/performance/test_memory_overhead.py
index 7935397cff4..2f4888c95ea 100644
--- a/tests/integration_tests/performance/test_memory_overhead.py
+++ b/tests/integration_tests/performance/test_memory_overhead.py
@@ -30,7 +30,13 @@
 )
 @pytest.mark.nonci
 def test_memory_overhead(
-    microvm_factory, guest_kernel_acpi, rootfs, vcpu_count, mem_size_mib, metrics
+    microvm_factory,
+    guest_kernel_acpi,
+    rootfs,
+    vcpu_count,
+    mem_size_mib,
+    pci_enabled,
+    metrics,
 ):
     """Track Firecracker memory overhead.

@@ -38,7 +44,9 @@ def test_memory_overhead(
     """

     for _ in range(5):
-        microvm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False)
+        microvm = microvm_factory.build(
+            guest_kernel_acpi, rootfs, pci=pci_enabled, monitor_memory=False
+        )
         microvm.spawn(emit_metrics=True)
         microvm.basic_config(vcpu_count=vcpu_count, mem_size_mib=mem_size_mib)
         microvm.add_net_iface()
diff --git a/tests/integration_tests/performance/test_network.py b/tests/integration_tests/performance/test_network.py
index 115ed4196b7..74ad26c26a8 100644
--- a/tests/integration_tests/performance/test_network.py
+++ b/tests/integration_tests/performance/test_network.py
@@ -38,14 +38,14 @@ def consume_ping_output(ping_putput):


 @pytest.fixture
-def network_microvm(request, microvm_factory, guest_kernel_acpi, rootfs):
+def network_microvm(request, uvm_plain_acpi):
     """Creates a microvm with the networking setup used by the performance tests in this file.
     This fixture receives its vcpu count via indirect parameterization"""

     guest_mem_mib = 1024
     guest_vcpus = request.param

-    vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False)
+    vm = uvm_plain_acpi
     vm.spawn(log_level="Info", emit_metrics=True)
     vm.basic_config(vcpu_count=guest_vcpus, mem_size_mib=guest_mem_mib)
     vm.add_net_iface()
diff --git a/tests/integration_tests/performance/test_snapshot.py b/tests/integration_tests/performance/test_snapshot.py
index 24ca4ab974b..b4e9afabb67 100644
--- a/tests/integration_tests/performance/test_snapshot.py
+++ b/tests/integration_tests/performance/test_snapshot.py
@@ -44,12 +44,13 @@ def id(self):
         """Computes a unique id for this test instance"""
         return "all_dev" if self.all_devices else f"{self.vcpus}vcpu_{self.mem}mb"

-    def boot_vm(self, microvm_factory, guest_kernel, rootfs) -> Microvm:
+    def boot_vm(self, microvm_factory, guest_kernel, rootfs, pci_enabled) -> Microvm:
         """Creates the initial snapshot that will be loaded repeatedly to sample latencies"""
         vm = microvm_factory.build(
             guest_kernel,
             rootfs,
             monitor_memory=False,
+            pci=pci_enabled,
         )
         vm.spawn(log_level="Info", emit_metrics=True)
         vm.time_api_requests = False
@@ -96,7 +97,7 @@ def boot_vm(self, microvm_factory, guest_kernel, rootfs) -> Microvm:
     ids=lambda x: x.id,
 )
 def test_restore_latency(
-    microvm_factory, rootfs, guest_kernel_linux_5_10, test_setup, metrics
+    microvm_factory, guest_kernel_linux_5_10, rootfs, pci_enabled, test_setup, metrics
 ):
     """
     Restores snapshots with vcpu/memory configuration, roughly scaling according to mem = (vcpus - 1) * 2048MB,
@@ -105,7 +106,9 @@ def test_restore_latency(

     We only test a single guest kernel, as the guest kernel does not "participate" in snapshot restore.
     """
-    vm = test_setup.boot_vm(microvm_factory, guest_kernel_linux_5_10, rootfs)
+    vm = test_setup.boot_vm(
+        microvm_factory, guest_kernel_linux_5_10, rootfs, pci_enabled
+    )

     metrics.set_dimensions(
         {
@@ -147,6 +150,7 @@ def test_post_restore_latency(
     microvm_factory,
     rootfs,
     guest_kernel_linux_5_10,
+    pci_enabled,
     metrics,
     uffd_handler,
     huge_pages,
@@ -156,7 +160,9 @@ def test_post_restore_latency(
         pytest.skip("huge page snapshots can only be restored using uffd")

     test_setup = SnapshotRestoreTest(mem=1024, vcpus=2, huge_pages=huge_pages)
-    vm = test_setup.boot_vm(microvm_factory, guest_kernel_linux_5_10, rootfs)
+    vm = test_setup.boot_vm(
+        microvm_factory, guest_kernel_linux_5_10, rootfs, pci_enabled
+    )

     metrics.set_dimensions(
         {
@@ -204,6 +210,7 @@ def test_population_latency(
     microvm_factory,
     rootfs,
     guest_kernel_linux_5_10,
+    pci_enabled,
     metrics,
     huge_pages,
     vcpus,
@@ -211,7 +218,9 @@ def test_population_latency(
 ):
     """Collects population latency metrics (e.g. how long it takes UFFD handler to fault in all memory)"""
     test_setup = SnapshotRestoreTest(mem=mem, vcpus=vcpus, huge_pages=huge_pages)
-    vm = test_setup.boot_vm(microvm_factory, guest_kernel_linux_5_10, rootfs)
+    vm = test_setup.boot_vm(
+        microvm_factory, guest_kernel_linux_5_10, rootfs, pci_enabled
+    )

     metrics.set_dimensions(
         {
@@ -255,15 +264,13 @@ def test_population_latency(


 @pytest.mark.nonci
 def test_snapshot_create_latency(
-    microvm_factory,
-    guest_kernel_linux_5_10,
-    rootfs,
+    uvm_plain,
     metrics,
     snapshot_type,
 ):
     """Measure the latency of creating a Full snapshot"""
-    vm = microvm_factory.build(guest_kernel_linux_5_10, rootfs, monitor_memory=False)
+    vm = uvm_plain
     vm.spawn()
     vm.basic_config(
         vcpu_count=2,
diff --git a/tests/integration_tests/performance/test_vhost_user_metrics.py b/tests/integration_tests/performance/test_vhost_user_metrics.py
index fd20b34a47b..a278ae79971 100644
--- a/tests/integration_tests/performance/test_vhost_user_metrics.py
+++ b/tests/integration_tests/performance/test_vhost_user_metrics.py
@@ -10,9 +10,7 @@


 @pytest.mark.parametrize("vcpu_count", [1, 2], ids=["1vcpu", "2vcpu"])
-def test_vhost_user_block_metrics(
-    microvm_factory, guest_kernel_acpi, rootfs, vcpu_count, metrics
-):
+def test_vhost_user_block_metrics(uvm_plain_acpi, vcpu_count, metrics):
     """
     This test tries to boot a VM with vhost-user-block
     as a scratch device, resize the vhost-user scratch drive to have
@@ -28,7 +26,7 @@ def test_vhost_user_block_metrics(
     # low->high->low->high and so the numbers are not in monotonic sequence.
     new_sizes = [20, 10, 30]  # MB

-    vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False)
+    vm = uvm_plain_acpi
     vm.spawn(log_level="Info")
     vm.basic_config(vcpu_count=vcpu_count)
     vm.add_net_iface()
diff --git a/tests/integration_tests/performance/test_vsock.py b/tests/integration_tests/performance/test_vsock.py
index bad4436e568..402e7ff66b5 100644
--- a/tests/integration_tests/performance/test_vsock.py
+++ b/tests/integration_tests/performance/test_vsock.py
@@ -75,9 +75,7 @@ def guest_command(self, port_offset):
 @pytest.mark.parametrize("payload_length", ["64K", "1024K"], ids=["p64K", "p1024K"])
 @pytest.mark.parametrize("mode", ["g2h", "h2g", "bd"])
 def test_vsock_throughput(
-    microvm_factory,
-    guest_kernel_acpi,
-    rootfs,
+    uvm_plain_acpi,
     vcpus,
     payload_length,
     mode,
@@ -94,7 +92,7 @@ def test_vsock_throughput(
         pytest.skip("bidrectional test only done with at least 2 vcpus")

     mem_size_mib = 1024
-    vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False)
+    vm = uvm_plain_acpi
     vm.spawn(log_level="Info", emit_metrics=True)
     vm.basic_config(vcpu_count=vcpus, mem_size_mib=mem_size_mib)
     vm.add_net_iface()
diff --git a/tests/integration_tests/security/test_vulnerabilities.py b/tests/integration_tests/security/test_vulnerabilities.py
index b15af03ab38..01b8e9c595b 100644
--- a/tests/integration_tests/security/test_vulnerabilities.py
+++ b/tests/integration_tests/security/test_vulnerabilities.py
@@ -222,12 +222,12 @@ def uvm_any_a(microvm_factory_a, uvm_ctor, guest_kernel, rootfs, cpu_template_an
     Since pytest caches fixtures, this guarantees uvm_any_a will match a vm from uvm_any.
     See https://docs.pytest.org/en/stable/how-to/fixtures.html#fixtures-can-be-requested-more-than-once-per-test-return-values-are-cached
     """
-    return uvm_ctor(microvm_factory_a, guest_kernel, rootfs, cpu_template_any)
+    return uvm_ctor(microvm_factory_a, guest_kernel, rootfs, cpu_template_any, False)


-def test_check_vulnerability_files_ab(request, uvm_any):
+def test_check_vulnerability_files_ab(request, uvm_any_without_pci):
     """Test vulnerability files on guests"""
-    res_b = check_vulnerabilities_files_on_guest(uvm_any)
+    res_b = check_vulnerabilities_files_on_guest(uvm_any_without_pci)
     if global_props.buildkite_pr:
         # we only get the uvm_any_a fixtures if we need it
         uvm_a = request.getfixturevalue("uvm_any_a")
@@ -239,11 +239,11 @@ def test_check_vulnerability_files_ab(request, uvm_any):

 def test_spectre_meltdown_checker_on_guest(
     request,
-    uvm_any,
+    uvm_any_without_pci,
     spectre_meltdown_checker,
 ):
     """Test with the spectre / meltdown checker on any supported guest."""
-    res_b = spectre_meltdown_checker.get_report_for_guest(uvm_any)
+    res_b = spectre_meltdown_checker.get_report_for_guest(uvm_any_without_pci)
     if global_props.buildkite_pr:
         # we only get the uvm_any_a fixtures if we need it
         uvm_a = request.getfixturevalue("uvm_any_a")
@@ -251,5 +251,5 @@ def test_spectre_meltdown_checker_on_guest(
         assert res_b <= res_a
     else:
         assert res_b == spectre_meltdown_checker.expected_vulnerabilities(
-            uvm_any.cpu_template_name
+            uvm_any_without_pci.cpu_template_name
         )
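
Reviewer note: the new pytest fixtures used throughout the hunks above (pci_enabled, uvm_plain_any, uvm_plain_acpi, uvm_plain_6_1, uvm_any_without_pci) are defined in conftest changes that are not part of this excerpt. As a rough orientation only, the minimal sketch below shows one plausible way such fixtures could be wired up; the fixture bodies, parametrization, and ids are assumptions for illustration, not the actual definitions from this change. Only the build(..., pci=..., monitor_memory=...) keyword usage is taken from the hunks above.

# Hypothetical sketch only -- not the conftest.py from this change.
# Assumes `microvm_factory`, `guest_kernel_acpi`, and `rootfs` fixtures already
# exist, and that the factory's build() accepts the `pci=` and `monitor_memory=`
# keyword arguments seen in the hunks above.
import pytest


@pytest.fixture(params=[False, True], ids=["no_pci", "pci"])
def pci_enabled(request):
    """Runs the requesting test once with PCI disabled and once with it enabled."""
    return request.param


@pytest.fixture
def uvm_plain_acpi(microvm_factory, guest_kernel_acpi, rootfs, pci_enabled):
    """A plain, not-yet-spawned microVM built from an ACPI-capable guest kernel."""
    return microvm_factory.build(
        guest_kernel_acpi, rootfs, pci=pci_enabled, monitor_memory=False
    )

Under this kind of setup a test receives the prebuilt VM and only calls spawn()/basic_config() itself, which is the pattern the hunks above migrate the block, network, vsock, and vhost-user tests to.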