From 2561fd4c5210f2951eb5e287fa13c2a840c3e609 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 16 Apr 2025 10:16:02 +0200 Subject: [PATCH 01/99] chore: prepare virtio for multiple transport options This is just code organization changes. Create a new module under `virtio`, called `transport`. For the time being the only transport supported is `mmio`. Also, move `IrqInterrupt` type within the MMIO transport code, as it is MMIO specific. Signed-off-by: Babis Chalios --- src/vmm/src/builder.rs | 2 +- src/vmm/src/device_manager/mmio.rs | 7 +- src/vmm/src/device_manager/persist.rs | 2 +- src/vmm/src/devices/bus.rs | 2 +- src/vmm/src/devices/virtio/balloon/device.rs | 2 +- .../src/devices/virtio/balloon/test_utils.rs | 2 +- src/vmm/src/devices/virtio/block/device.rs | 3 +- .../devices/virtio/block/vhost_user/device.rs | 7 +- .../src/devices/virtio/block/virtio/device.rs | 5 +- .../devices/virtio/block/virtio/persist.rs | 3 +- .../devices/virtio/block/virtio/test_utils.rs | 4 +- src/vmm/src/devices/virtio/device.rs | 87 +------------------ src/vmm/src/devices/virtio/mod.rs | 2 +- src/vmm/src/devices/virtio/net/device.rs | 4 +- src/vmm/src/devices/virtio/net/test_utils.rs | 7 +- src/vmm/src/devices/virtio/persist.rs | 4 +- src/vmm/src/devices/virtio/rng/device.rs | 3 +- .../devices/virtio/{ => transport}/mmio.rs | 86 +++++++++++++++++- src/vmm/src/devices/virtio/transport/mod.rs | 5 ++ src/vmm/src/devices/virtio/vhost_user.rs | 2 +- src/vmm/src/devices/virtio/vsock/device.rs | 5 +- 21 files changed, 130 insertions(+), 114 deletions(-) rename src/vmm/src/devices/virtio/{ => transport}/mmio.rs (92%) create mode 100644 src/vmm/src/devices/virtio/transport/mod.rs diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 74f03e6b111..398a9f2f037 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -44,9 +44,9 @@ use crate::devices::legacy::{EventFdTrigger, SerialEventsWrapper, SerialWrapper} use crate::devices::virtio::balloon::Balloon; use crate::devices::virtio::block::device::Block; use crate::devices::virtio::device::VirtioDevice; -use crate::devices::virtio::mmio::MmioTransport; use crate::devices::virtio::net::Net; use crate::devices::virtio::rng::Entropy; +use crate::devices::virtio::transport::mmio::MmioTransport; use crate::devices::virtio::vsock::{Vsock, VsockUnixBackend}; #[cfg(feature = "gdb")] use crate::gdb; diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 9a7dc775295..18535efa61a 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -29,9 +29,9 @@ use crate::devices::pseudo::BootTimer; use crate::devices::virtio::balloon::Balloon; use crate::devices::virtio::block::device::Block; use crate::devices::virtio::device::VirtioDevice; -use crate::devices::virtio::mmio::MmioTransport; use crate::devices::virtio::net::Net; use crate::devices::virtio::rng::Entropy; +use crate::devices::virtio::transport::mmio::MmioTransport; use crate::devices::virtio::vsock::{TYPE_VSOCK, Vsock, VsockUnixBackend}; use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; #[cfg(target_arch = "x86_64")] @@ -222,7 +222,7 @@ impl MMIODeviceManager { device_info: &MMIODeviceInfo, ) -> Result<(), MmioError> { // as per doc, [virtio_mmio.]device=@: needs to be appended - // to kernel command line for virtio mmio devices to get recongnized + // to kernel command line for virtio mmio devices to get recognized // the size parameter has to be transformed to KiB, so dividing hexadecimal value in // 
bytes to 1024; further, the '{}' formatting rust construct will automatically // transform it to decimal @@ -529,8 +529,9 @@ mod tests { use super::*; use crate::Vm; use crate::devices::virtio::ActivateError; - use crate::devices::virtio::device::{IrqTrigger, VirtioDevice}; + use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::Queue; + use crate::devices::virtio::transport::mmio::IrqTrigger; use crate::test_utils::multi_region_mem_raw; use crate::vstate::kvm::Kvm; use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index fd24db52c3b..7b9ccf6c7de 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -25,7 +25,6 @@ use crate::devices::virtio::block::BlockError; use crate::devices::virtio::block::device::Block; use crate::devices::virtio::block::persist::{BlockConstructorArgs, BlockState}; use crate::devices::virtio::device::VirtioDevice; -use crate::devices::virtio::mmio::MmioTransport; use crate::devices::virtio::net::Net; use crate::devices::virtio::net::persist::{ NetConstructorArgs, NetPersistError as NetError, NetState, @@ -35,6 +34,7 @@ use crate::devices::virtio::rng::Entropy; use crate::devices::virtio::rng::persist::{ EntropyConstructorArgs, EntropyPersistError as EntropyError, EntropyState, }; +use crate::devices::virtio::transport::mmio::MmioTransport; use crate::devices::virtio::vsock::persist::{ VsockConstructorArgs, VsockState, VsockUdsConstructorArgs, }; diff --git a/src/vmm/src/devices/bus.rs b/src/vmm/src/devices/bus.rs index 2b016d73083..d0e1b296998 100644 --- a/src/vmm/src/devices/bus.rs +++ b/src/vmm/src/devices/bus.rs @@ -56,7 +56,7 @@ use event_manager::{EventOps, Events, MutEventSubscriber}; use super::legacy::RTCDevice; use super::legacy::{I8042Device, SerialDevice}; use super::pseudo::BootTimer; -use super::virtio::mmio::MmioTransport; +use super::virtio::transport::mmio::MmioTransport; #[derive(Debug)] pub enum BusDevice { diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs index 8962c992cf8..302ab832eab 100644 --- a/src/vmm/src/devices/virtio/balloon/device.rs +++ b/src/vmm/src/devices/virtio/balloon/device.rs @@ -23,9 +23,9 @@ use super::{ VIRTIO_BALLOON_S_SWAP_OUT, }; use crate::devices::virtio::balloon::BalloonError; -use crate::devices::virtio::device::{IrqTrigger, IrqType}; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::queue::InvalidAvailIdx; +use crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; use crate::logger::IncMetric; use crate::utils::u64_to_usize; use crate::vstate::memory::{Address, ByteValued, Bytes, GuestAddress, GuestMemoryMmap}; diff --git a/src/vmm/src/devices/virtio/balloon/test_utils.rs b/src/vmm/src/devices/virtio/balloon/test_utils.rs index af0d7f5845e..69b0b4f92a0 100644 --- a/src/vmm/src/devices/virtio/balloon/test_utils.rs +++ b/src/vmm/src/devices/virtio/balloon/test_utils.rs @@ -10,7 +10,7 @@ use crate::devices::virtio::{balloon::BALLOON_NUM_QUEUES, balloon::Balloon}; #[cfg(test)] pub fn invoke_handler_for_queue_event(b: &mut Balloon, queue_index: usize) { use crate::devices::virtio::balloon::{DEFLATE_INDEX, INFLATE_INDEX, STATS_INDEX}; - use crate::devices::virtio::device::IrqType; + use crate::devices::virtio::transport::mmio::IrqType; assert!(queue_index < BALLOON_NUM_QUEUES); // Trigger the queue event. 
diff --git a/src/vmm/src/devices/virtio/block/device.rs b/src/vmm/src/devices/virtio/block/device.rs index 5d41eb04078..4f4676a24a8 100644 --- a/src/vmm/src/devices/virtio/block/device.rs +++ b/src/vmm/src/devices/virtio/block/device.rs @@ -8,8 +8,9 @@ use super::BlockError; use super::persist::{BlockConstructorArgs, BlockState}; use super::vhost_user::device::{VhostUserBlock, VhostUserBlockConfig}; use super::virtio::device::{VirtioBlock, VirtioBlockConfig}; -use crate::devices::virtio::device::{IrqTrigger, VirtioDevice}; +use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::{InvalidAvailIdx, Queue}; +use crate::devices::virtio::transport::mmio::IrqTrigger; use crate::devices::virtio::{ActivateError, TYPE_BLOCK}; use crate::rate_limiter::BucketUpdate; use crate::snapshot::Persist; diff --git a/src/vmm/src/devices/virtio/block/vhost_user/device.rs b/src/vmm/src/devices/virtio/block/vhost_user/device.rs index a42a2fe0c46..87f6264db4c 100644 --- a/src/vmm/src/devices/virtio/block/vhost_user/device.rs +++ b/src/vmm/src/devices/virtio/block/vhost_user/device.rs @@ -14,11 +14,12 @@ use vmm_sys_util::eventfd::EventFd; use super::{NUM_QUEUES, QUEUE_SIZE, VhostUserBlockError}; use crate::devices::virtio::block::CacheType; -use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; +use crate::devices::virtio::device::{DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_blk::{VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO}; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::generated::virtio_ring::VIRTIO_RING_F_EVENT_IDX; use crate::devices::virtio::queue::Queue; +use crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; use crate::devices::virtio::vhost_user::{VhostUserHandleBackend, VhostUserHandleImpl}; use crate::devices::virtio::vhost_user_metrics::{ VhostUserDeviceMetrics, VhostUserMetricsPerDevice, @@ -34,7 +35,7 @@ const BLOCK_CONFIG_SPACE_SIZE: u32 = 60; const AVAILABLE_FEATURES: u64 = (1 << VIRTIO_F_VERSION_1) | (1 << VIRTIO_RING_F_EVENT_IDX) - // vhost-user specific bit. Not defined in standart virtio spec. + // vhost-user specific bit. Not defined in standard virtio spec. // Specifies ability of frontend to negotiate protocol features. | VhostUserVirtioFeatures::PROTOCOL_FEATURES.bits() // We always try to negotiate readonly with the backend. 
@@ -375,8 +376,8 @@ mod tests { use super::*; use crate::devices::virtio::block::virtio::device::FileEngineType; - use crate::devices::virtio::mmio::VIRTIO_MMIO_INT_CONFIG; use crate::devices::virtio::test_utils::VirtQueue; + use crate::devices::virtio::transport::mmio::VIRTIO_MMIO_INT_CONFIG; use crate::devices::virtio::vhost_user::tests::create_mem; use crate::test_utils::create_tmp_socket; use crate::vstate::memory::GuestAddress; diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index 2f5d88114b6..f1e978cc096 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -23,13 +23,14 @@ use super::request::*; use super::{BLOCK_QUEUE_SIZES, SECTOR_SHIFT, SECTOR_SIZE, VirtioBlockError, io as block_io}; use crate::devices::virtio::block::CacheType; use crate::devices::virtio::block::virtio::metrics::{BlockDeviceMetrics, BlockMetricsPerDevice}; -use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; +use crate::devices::virtio::device::{DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_blk::{ VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO, VIRTIO_BLK_ID_BYTES, }; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::generated::virtio_ring::VIRTIO_RING_F_EVENT_IDX; use crate::devices::virtio::queue::{InvalidAvailIdx, Queue}; +use crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; use crate::devices::virtio::{ActivateError, TYPE_BLOCK}; use crate::logger::{IncMetric, error, warn}; use crate::rate_limiter::{BucketUpdate, RateLimiter}; @@ -829,7 +830,7 @@ mod tests { block.read_config(0, actual_config_space.as_mut_slice()); assert_eq!(actual_config_space, expected_config_space); - // If priviledged user writes to `/dev/mem`, in block config space - byte by byte. + // If privileged user writes to `/dev/mem`, in block config space - byte by byte. 
let expected_config_space = ConfigSpace { capacity: 0x1122334455667788, }; diff --git a/src/vmm/src/devices/virtio/block/virtio/persist.rs b/src/vmm/src/devices/virtio/block/virtio/persist.rs index 8c6f2c2453d..dafad8e91e6 100644 --- a/src/vmm/src/devices/virtio/block/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/block/virtio/persist.rs @@ -16,9 +16,10 @@ use crate::devices::virtio::TYPE_BLOCK; use crate::devices::virtio::block::persist::BlockConstructorArgs; use crate::devices::virtio::block::virtio::device::FileEngineType; use crate::devices::virtio::block::virtio::metrics::BlockMetricsPerDevice; -use crate::devices::virtio::device::{DeviceState, IrqTrigger}; +use crate::devices::virtio::device::DeviceState; use crate::devices::virtio::generated::virtio_blk::VIRTIO_BLK_F_RO; use crate::devices::virtio::persist::VirtioDeviceState; +use crate::devices::virtio::transport::mmio::IrqTrigger; use crate::rate_limiter::RateLimiter; use crate::rate_limiter::persist::RateLimiterState; use crate::snapshot::Persist; diff --git a/src/vmm/src/devices/virtio/block/virtio/test_utils.rs b/src/vmm/src/devices/virtio/block/virtio/test_utils.rs index 02dd34fbce9..b05e899f32d 100644 --- a/src/vmm/src/devices/virtio/block/virtio/test_utils.rs +++ b/src/vmm/src/devices/virtio/block/virtio/test_utils.rs @@ -16,10 +16,10 @@ use crate::devices::virtio::block::virtio::device::FileEngineType; #[cfg(test)] use crate::devices::virtio::block::virtio::io::FileEngine; use crate::devices::virtio::block::virtio::{CacheType, VirtioBlock}; -#[cfg(test)] -use crate::devices::virtio::device::IrqType; use crate::devices::virtio::queue::{Queue, VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; use crate::devices::virtio::test_utils::{VirtQueue, VirtqDesc}; +#[cfg(test)] +use crate::devices::virtio::transport::mmio::IrqType; use crate::rate_limiter::RateLimiter; use crate::vmm_config::{RateLimiterConfig, TokenBucketConfig}; use crate::vstate::memory::{Bytes, GuestAddress}; diff --git a/src/vmm/src/devices/virtio/device.rs b/src/vmm/src/devices/virtio/device.rs index ba1ca6b279e..2afe8f2f485 100644 --- a/src/vmm/src/devices/virtio/device.rs +++ b/src/vmm/src/devices/virtio/device.rs @@ -7,15 +7,15 @@ use std::fmt; use std::sync::Arc; -use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::atomic::AtomicU32; use vmm_sys_util::eventfd::EventFd; use super::ActivateError; -use super::mmio::{VIRTIO_MMIO_INT_CONFIG, VIRTIO_MMIO_INT_VRING}; use super::queue::{Queue, QueueError}; +use super::transport::mmio::IrqTrigger; use crate::devices::virtio::AsAny; -use crate::logger::{error, warn}; +use crate::logger::warn; use crate::vstate::memory::GuestMemoryMmap; /// Enum that indicates if a VirtioDevice is inactive or has been activated @@ -44,46 +44,6 @@ impl DeviceState { } } -/// The 2 types of interrupt sources in MMIO transport. -#[derive(Debug)] -pub enum IrqType { - /// Interrupt triggered by change in config. - Config, - /// Interrupt triggered by used vring buffers. 
- Vring, -} - -/// Helper struct that is responsible for triggering guest IRQs -#[derive(Debug)] -pub struct IrqTrigger { - pub(crate) irq_status: Arc, - pub(crate) irq_evt: EventFd, -} - -impl IrqTrigger { - pub fn new() -> std::io::Result { - Ok(Self { - irq_status: Arc::new(AtomicU32::new(0)), - irq_evt: EventFd::new(libc::EFD_NONBLOCK)?, - }) - } - - pub fn trigger_irq(&self, irq_type: IrqType) -> Result<(), std::io::Error> { - let irq = match irq_type { - IrqType::Config => VIRTIO_MMIO_INT_CONFIG, - IrqType::Vring => VIRTIO_MMIO_INT_VRING, - }; - self.irq_status.fetch_or(irq, Ordering::SeqCst); - - self.irq_evt.write(1).map_err(|err| { - error!("Failed to send irq to the guest: {:?}", err); - err - })?; - - Ok(()) - } -} - /// Trait for virtio devices to be driven by a virtio transport. /// /// The lifecycle of a virtio device is to be moved to a virtio transport, which will then query the @@ -200,47 +160,6 @@ impl fmt::Debug for dyn VirtioDevice { pub(crate) mod tests { use super::*; - impl IrqTrigger { - pub fn has_pending_irq(&self, irq_type: IrqType) -> bool { - if let Ok(num_irqs) = self.irq_evt.read() { - if num_irqs == 0 { - return false; - } - - let irq_status = self.irq_status.load(Ordering::SeqCst); - return matches!( - (irq_status, irq_type), - (VIRTIO_MMIO_INT_CONFIG, IrqType::Config) - | (VIRTIO_MMIO_INT_VRING, IrqType::Vring) - ); - } - - false - } - } - - #[test] - fn irq_trigger() { - let irq_trigger = IrqTrigger::new().unwrap(); - assert_eq!(irq_trigger.irq_status.load(Ordering::SeqCst), 0); - - // Check that there are no pending irqs. - assert!(!irq_trigger.has_pending_irq(IrqType::Config)); - assert!(!irq_trigger.has_pending_irq(IrqType::Vring)); - - // Check that trigger_irq() correctly generates irqs. - irq_trigger.trigger_irq(IrqType::Config).unwrap(); - assert!(irq_trigger.has_pending_irq(IrqType::Config)); - irq_trigger.irq_status.store(0, Ordering::SeqCst); - irq_trigger.trigger_irq(IrqType::Vring).unwrap(); - assert!(irq_trigger.has_pending_irq(IrqType::Vring)); - - // Check trigger_irq() failure case (irq_evt is full). 
- irq_trigger.irq_evt.write(u64::MAX - 1).unwrap(); - irq_trigger.trigger_irq(IrqType::Config).unwrap_err(); - irq_trigger.trigger_irq(IrqType::Vring).unwrap_err(); - } - #[derive(Debug)] struct MockVirtioDevice { acked_features: u64, diff --git a/src/vmm/src/devices/virtio/mod.rs b/src/vmm/src/devices/virtio/mod.rs index f298d28e9bd..0ac3b660397 100644 --- a/src/vmm/src/devices/virtio/mod.rs +++ b/src/vmm/src/devices/virtio/mod.rs @@ -18,12 +18,12 @@ pub mod device; pub mod generated; mod iov_deque; pub mod iovec; -pub mod mmio; pub mod net; pub mod persist; pub mod queue; pub mod rng; pub mod test_utils; +pub mod transport; pub mod vhost_user; pub mod vhost_user_metrics; pub mod vsock; diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index 2ce60707271..093c83c354b 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -15,7 +15,7 @@ use log::error; use vmm_sys_util::eventfd::EventFd; use super::NET_QUEUE_MAX_SIZE; -use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; +use crate::devices::virtio::device::{DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::generated::virtio_net::{ VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, @@ -32,6 +32,7 @@ use crate::devices::virtio::net::{ MAX_BUFFER_SIZE, NET_QUEUE_SIZES, NetError, NetQueue, RX_INDEX, TX_INDEX, generated, }; use crate::devices::virtio::queue::{DescriptorChain, InvalidAvailIdx, Queue}; +use crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; use crate::devices::virtio::{ActivateError, TYPE_NET}; use crate::devices::{DeviceError, report_net_event_fail}; use crate::dumbo::pdu::arp::ETH_IPV4_FRAME_LEN; @@ -1059,6 +1060,7 @@ pub mod tests { }; use crate::devices::virtio::queue::VIRTQ_DESC_F_WRITE; use crate::devices::virtio::test_utils::VirtQueue; + use crate::devices::virtio::transport::mmio::IrqType; use crate::dumbo::EthernetFrame; use crate::dumbo::pdu::arp::{ETH_IPV4_FRAME_LEN, EthIPv4ArpFrame}; use crate::dumbo::pdu::ethernet::ETHERTYPE_ARP; diff --git a/src/vmm/src/devices/virtio/net/test_utils.rs b/src/vmm/src/devices/virtio/net/test_utils.rs index 2df7891e034..ec52883e979 100644 --- a/src/vmm/src/devices/virtio/net/test_utils.rs +++ b/src/vmm/src/devices/virtio/net/test_utils.rs @@ -103,7 +103,7 @@ impl TapTrafficSimulator { let send_addr_ptr = &mut storage as *mut libc::sockaddr_storage; - // SAFETY: `sock_addr` is a valid pointer and safe to derference. + // SAFETY: `sock_addr` is a valid pointer and safe to dereference. unsafe { let sock_addr: *mut libc::sockaddr_ll = send_addr_ptr.cast::(); (*sock_addr).sll_family = libc::sa_family_t::try_from(libc::AF_PACKET).unwrap(); @@ -222,7 +222,7 @@ pub fn if_index(tap: &Tap) -> i32 { /// Enable the tap interface. 
pub fn enable(tap: &Tap) { - // Disable IPv6 router advertisment requests + // Disable IPv6 router advertisement requests Command::new("sh") .arg("-c") .arg(format!( @@ -291,7 +291,7 @@ pub mod test { use event_manager::{EventManager, SubscriberId, SubscriberOps}; use crate::check_metric_after_block; - use crate::devices::virtio::device::{IrqType, VirtioDevice}; + use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::net::device::vnet_hdr_len; use crate::devices::virtio::net::generated::ETH_HLEN; use crate::devices::virtio::net::test_utils::{ @@ -300,6 +300,7 @@ pub mod test { use crate::devices::virtio::net::{MAX_BUFFER_SIZE, Net, RX_INDEX, TX_INDEX}; use crate::devices::virtio::queue::{VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; use crate::devices::virtio::test_utils::{VirtQueue, VirtqDesc}; + use crate::devices::virtio::transport::mmio::IrqType; use crate::logger::IncMetric; use crate::vstate::memory::{Address, Bytes, GuestAddress, GuestMemoryMmap}; diff --git a/src/vmm/src/devices/virtio/persist.rs b/src/vmm/src/devices/virtio/persist.rs index 664f6d57efb..1a1eb6dba7d 100644 --- a/src/vmm/src/devices/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/persist.rs @@ -12,8 +12,8 @@ use serde::{Deserialize, Serialize}; use super::queue::{InvalidAvailIdx, QueueError}; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::generated::virtio_ring::VIRTIO_RING_F_EVENT_IDX; -use crate::devices::virtio::mmio::MmioTransport; use crate::devices::virtio::queue::Queue; +use crate::devices::virtio::transport::mmio::MmioTransport; use crate::snapshot::Persist; use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; @@ -256,10 +256,10 @@ mod tests { use crate::devices::virtio::block::virtio::VirtioBlock; use crate::devices::virtio::block::virtio::device::FileEngineType; use crate::devices::virtio::block::virtio::test_utils::default_block_with_path; - use crate::devices::virtio::mmio::tests::DummyDevice; use crate::devices::virtio::net::Net; use crate::devices::virtio::net::test_utils::default_net; use crate::devices::virtio::test_utils::default_mem; + use crate::devices::virtio::transport::mmio::tests::DummyDevice; use crate::devices::virtio::vsock::{Vsock, VsockUnixBackend}; use crate::snapshot::Snapshot; diff --git a/src/vmm/src/devices/virtio/rng/device.rs b/src/vmm/src/devices/virtio/rng/device.rs index 38308e9b6b7..2ee9834167d 100644 --- a/src/vmm/src/devices/virtio/rng/device.rs +++ b/src/vmm/src/devices/virtio/rng/device.rs @@ -12,11 +12,12 @@ use vmm_sys_util::eventfd::EventFd; use super::metrics::METRICS; use super::{RNG_NUM_QUEUES, RNG_QUEUE}; use crate::devices::DeviceError; -use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; +use crate::devices::virtio::device::{DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::iov_deque::IovDequeError; use crate::devices::virtio::iovec::IoVecBufferMut; use crate::devices::virtio::queue::{FIRECRACKER_MAX_QUEUE_SIZE, InvalidAvailIdx, Queue}; +use crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; use crate::devices::virtio::{ActivateError, TYPE_RNG}; use crate::logger::{IncMetric, debug, error}; use crate::rate_limiter::{RateLimiter, TokenType}; diff --git a/src/vmm/src/devices/virtio/mmio.rs b/src/vmm/src/devices/virtio/transport/mmio.rs similarity index 92% rename from src/vmm/src/devices/virtio/mmio.rs rename to src/vmm/src/devices/virtio/transport/mmio.rs index 
4114838bdd3..b6e2b796398 100644 --- a/src/vmm/src/devices/virtio/mmio.rs +++ b/src/vmm/src/devices/virtio/transport/mmio.rs @@ -9,7 +9,9 @@ use std::fmt::Debug; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex, MutexGuard}; -use crate::devices::virtio::device::{IrqType, VirtioDevice}; +use vmm_sys_util::eventfd::EventFd; + +use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::device_status; use crate::devices::virtio::queue::Queue; use crate::logger::{error, warn}; @@ -356,13 +358,52 @@ impl MmioTransport { } } +/// The 2 types of interrupt sources in MMIO transport. +#[derive(Debug)] +pub enum IrqType { + /// Interrupt triggered by change in config. + Config, + /// Interrupt triggered by used vring buffers. + Vring, +} + +/// Helper struct that is responsible for triggering guest IRQs +#[derive(Debug)] +pub struct IrqTrigger { + pub(crate) irq_status: Arc, + pub(crate) irq_evt: EventFd, +} + +impl IrqTrigger { + pub fn new() -> std::io::Result { + Ok(Self { + irq_status: Arc::new(AtomicU32::new(0)), + irq_evt: EventFd::new(libc::EFD_NONBLOCK)?, + }) + } + + pub fn trigger_irq(&self, irq_type: IrqType) -> Result<(), std::io::Error> { + let irq = match irq_type { + IrqType::Config => VIRTIO_MMIO_INT_CONFIG, + IrqType::Vring => VIRTIO_MMIO_INT_VRING, + }; + self.irq_status.fetch_or(irq, Ordering::SeqCst); + + self.irq_evt.write(1).map_err(|err| { + error!("Failed to send irq to the guest: {:?}", err); + err + })?; + + Ok(()) + } +} + #[cfg(test)] pub(crate) mod tests { use vmm_sys_util::eventfd::EventFd; use super::*; use crate::devices::virtio::ActivateError; - use crate::devices::virtio::device::IrqTrigger; use crate::devices::virtio::device_status::DEVICE_NEEDS_RESET; use crate::test_utils::single_region_mem; use crate::utils::byte_order::{read_le_u32, write_le_u32}; @@ -968,4 +1009,45 @@ pub(crate) mod tests { dummy_dev.ack_features_by_page(0, 8); assert_eq!(dummy_dev.acked_features(), 24); } + + impl IrqTrigger { + pub fn has_pending_irq(&self, irq_type: IrqType) -> bool { + if let Ok(num_irqs) = self.irq_evt.read() { + if num_irqs == 0 { + return false; + } + + let irq_status = self.irq_status.load(Ordering::SeqCst); + return matches!( + (irq_status, irq_type), + (VIRTIO_MMIO_INT_CONFIG, IrqType::Config) + | (VIRTIO_MMIO_INT_VRING, IrqType::Vring) + ); + } + + false + } + } + + #[test] + fn irq_trigger() { + let irq_trigger = IrqTrigger::new().unwrap(); + assert_eq!(irq_trigger.irq_status.load(Ordering::SeqCst), 0); + + // Check that there are no pending irqs. + assert!(!irq_trigger.has_pending_irq(IrqType::Config)); + assert!(!irq_trigger.has_pending_irq(IrqType::Vring)); + + // Check that trigger_irq() correctly generates irqs. + irq_trigger.trigger_irq(IrqType::Config).unwrap(); + assert!(irq_trigger.has_pending_irq(IrqType::Config)); + irq_trigger.irq_status.store(0, Ordering::SeqCst); + irq_trigger.trigger_irq(IrqType::Vring).unwrap(); + assert!(irq_trigger.has_pending_irq(IrqType::Vring)); + + // Check trigger_irq() failure case (irq_evt is full). + irq_trigger.irq_evt.write(u64::MAX - 1).unwrap(); + irq_trigger.trigger_irq(IrqType::Config).unwrap_err(); + irq_trigger.trigger_irq(IrqType::Vring).unwrap_err(); + } } diff --git a/src/vmm/src/devices/virtio/transport/mod.rs b/src/vmm/src/devices/virtio/transport/mod.rs new file mode 100644 index 00000000000..1ff8229a1c8 --- /dev/null +++ b/src/vmm/src/devices/virtio/transport/mod.rs @@ -0,0 +1,5 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +/// MMIO transport for VirtIO devices +pub mod mmio; diff --git a/src/vmm/src/devices/virtio/vhost_user.rs b/src/vmm/src/devices/virtio/vhost_user.rs index 13b0d71b35a..d90ad16b08c 100644 --- a/src/vmm/src/devices/virtio/vhost_user.rs +++ b/src/vmm/src/devices/virtio/vhost_user.rs @@ -13,8 +13,8 @@ use vhost::{Error as VhostError, VhostBackend, VhostUserMemoryRegionInfo, VringC use vm_memory::{Address, Error as MmapError, GuestMemory, GuestMemoryError, GuestMemoryRegion}; use vmm_sys_util::eventfd::EventFd; -use crate::devices::virtio::device::IrqTrigger; use crate::devices::virtio::queue::Queue; +use crate::devices::virtio::transport::mmio::IrqTrigger; use crate::vstate::memory::GuestMemoryMmap; /// vhost-user error. diff --git a/src/vmm/src/devices/virtio/vsock/device.rs b/src/vmm/src/devices/virtio/vsock/device.rs index a4377768322..e0b8477123a 100644 --- a/src/vmm/src/devices/virtio/vsock/device.rs +++ b/src/vmm/src/devices/virtio/vsock/device.rs @@ -6,7 +6,7 @@ // found in the THIRD-PARTY file. //! This is the `VirtioDevice` implementation for our vsock device. It handles the virtio-level -//! device logic: feature negociation, device configuration, and device activation. +//! device logic: feature negotiation, device configuration, and device activation. //! //! We aim to conform to the VirtIO v1.1 spec: //! https://docs.oasis-open.org/virtio/virtio/v1.1/virtio-v1.1.html @@ -30,9 +30,10 @@ use super::defs::uapi; use super::packet::{VSOCK_PKT_HDR_SIZE, VsockPacketRx, VsockPacketTx}; use super::{VsockBackend, defs}; use crate::devices::virtio::ActivateError; -use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; +use crate::devices::virtio::device::{DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_config::{VIRTIO_F_IN_ORDER, VIRTIO_F_VERSION_1}; use crate::devices::virtio::queue::{InvalidAvailIdx, Queue as VirtQueue}; +use crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; use crate::devices::virtio::vsock::VsockError; use crate::devices::virtio::vsock::metrics::METRICS; use crate::logger::IncMetric; From 21813292a4631e42af93bc855f2ce5a1598fc0cc Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 23 Apr 2025 13:56:00 +0200 Subject: [PATCH 02/99] chore: avoid IrqTrigger::new().unwrap() `IrqTrigger::new()` returns a `Result` because creating an `EventFd` might fail with an `std::io::Error` error. All users of `IrqTrigger` create the object and directly unwrap the error. To avoid unwraps all over the place, change `IrqTrigger::new()` to unwrap a potential error while creating the EventFd internally and just return `Self`. 
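For illustration, the constructor after this change panics internally instead of
returning a `Result` (a sketch mirroring the diff below; the field names are those
of the existing `IrqTrigger` struct and the `expect` message is the one added by
this patch):

    impl IrqTrigger {
        pub fn new() -> Self {
            Self {
                irq_status: Arc::new(AtomicU32::new(0)),
                // Creating the eventfd is not expected to fail in practice,
                // so treat a failure as fatal instead of propagating an io::Error.
                irq_evt: EventFd::new(libc::EFD_NONBLOCK)
                    .expect("Could not create EventFd for IrqTrigger"),
            }
        }
    }

Call sites then use plain `IrqTrigger::new()` instead of
`IrqTrigger::new().unwrap()` or `IrqTrigger::new().map_err(...)?`.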
Signed-off-by: Babis Chalios --- src/vmm/src/device_manager/mmio.rs | 2 +- src/vmm/src/devices/virtio/balloon/device.rs | 2 +- .../devices/virtio/block/vhost_user/device.rs | 2 +- .../src/devices/virtio/block/virtio/device.rs | 2 +- .../devices/virtio/block/virtio/persist.rs | 2 +- src/vmm/src/devices/virtio/net/device.rs | 2 +- src/vmm/src/devices/virtio/rng/device.rs | 2 +- src/vmm/src/devices/virtio/transport/mmio.rs | 19 +++++++++++++------ src/vmm/src/devices/virtio/vhost_user.rs | 2 +- src/vmm/src/devices/virtio/vsock/device.rs | 2 +- 10 files changed, 22 insertions(+), 15 deletions(-) diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 18535efa61a..8347765180b 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -584,7 +584,7 @@ mod tests { dummy: 0, queues: QUEUE_SIZES.iter().map(|&s| Queue::new(s)).collect(), queue_evts: [EventFd::new(libc::EFD_NONBLOCK).expect("cannot create eventFD")], - interrupt_trigger: IrqTrigger::new().expect("cannot create eventFD"), + interrupt_trigger: IrqTrigger::new(), } } } diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs index 302ab832eab..3a13bb508ec 100644 --- a/src/vmm/src/devices/virtio/balloon/device.rs +++ b/src/vmm/src/devices/virtio/balloon/device.rs @@ -220,7 +220,7 @@ impl Balloon { }, queue_evts, queues, - irq_trigger: IrqTrigger::new().map_err(BalloonError::EventFd)?, + irq_trigger: IrqTrigger::new(), device_state: DeviceState::Inactive, activate_evt: EventFd::new(libc::EFD_NONBLOCK).map_err(BalloonError::EventFd)?, restored_from_file, diff --git a/src/vmm/src/devices/virtio/block/vhost_user/device.rs b/src/vmm/src/devices/virtio/block/vhost_user/device.rs index 87f6264db4c..264db2fa7f0 100644 --- a/src/vmm/src/devices/virtio/block/vhost_user/device.rs +++ b/src/vmm/src/devices/virtio/block/vhost_user/device.rs @@ -204,7 +204,7 @@ impl VhostUserBlockImpl { let queue_evts = [EventFd::new(libc::EFD_NONBLOCK).map_err(VhostUserBlockError::EventFd)?; u64_to_usize(NUM_QUEUES)]; let device_state = DeviceState::Inactive; - let irq_trigger = IrqTrigger::new().map_err(VhostUserBlockError::IrqTrigger)?; + let irq_trigger = IrqTrigger::new(); // We negotiated features with backend. Now these acked_features // are available for guest driver to choose from. 
diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index f1e978cc096..e89443e5bd9 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -323,7 +323,7 @@ impl VirtioBlock { queues, queue_evts, device_state: DeviceState::Inactive, - irq_trigger: IrqTrigger::new().map_err(VirtioBlockError::IrqTrigger)?, + irq_trigger: IrqTrigger::new(), id: config.drive_id.clone(), partuuid: config.partuuid, diff --git a/src/vmm/src/devices/virtio/block/virtio/persist.rs b/src/vmm/src/devices/virtio/block/virtio/persist.rs index dafad8e91e6..33a33968e53 100644 --- a/src/vmm/src/devices/virtio/block/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/block/virtio/persist.rs @@ -112,7 +112,7 @@ impl Persist<'_> for VirtioBlock { ) .map_err(VirtioBlockError::Persist)?; - let mut irq_trigger = IrqTrigger::new().map_err(VirtioBlockError::IrqTrigger)?; + let mut irq_trigger = IrqTrigger::new(); irq_trigger.irq_status = Arc::new(AtomicU32::new(state.virtio_state.interrupt_status)); let avail_features = state.virtio_state.avail_features; diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index 093c83c354b..55a94636495 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -314,7 +314,7 @@ impl Net { tx_rate_limiter, rx_frame_buf: [0u8; MAX_BUFFER_SIZE], tx_frame_headers: [0u8; frame_hdr_len()], - irq_trigger: IrqTrigger::new().map_err(NetError::EventFd)?, + irq_trigger: IrqTrigger::new(), config_space, guest_mac, device_state: DeviceState::Inactive, diff --git a/src/vmm/src/devices/virtio/rng/device.rs b/src/vmm/src/devices/virtio/rng/device.rs index 2ee9834167d..d644161d87e 100644 --- a/src/vmm/src/devices/virtio/rng/device.rs +++ b/src/vmm/src/devices/virtio/rng/device.rs @@ -70,7 +70,7 @@ impl Entropy { let queue_events = (0..RNG_NUM_QUEUES) .map(|_| EventFd::new(libc::EFD_NONBLOCK)) .collect::, io::Error>>()?; - let irq_trigger = IrqTrigger::new()?; + let irq_trigger = IrqTrigger::new(); Ok(Self { avail_features: 1 << VIRTIO_F_VERSION_1, diff --git a/src/vmm/src/devices/virtio/transport/mmio.rs b/src/vmm/src/devices/virtio/transport/mmio.rs index b6e2b796398..a763ff811dc 100644 --- a/src/vmm/src/devices/virtio/transport/mmio.rs +++ b/src/vmm/src/devices/virtio/transport/mmio.rs @@ -374,12 +374,19 @@ pub struct IrqTrigger { pub(crate) irq_evt: EventFd, } +impl Default for IrqTrigger { + fn default() -> Self { + Self::new() + } +} + impl IrqTrigger { - pub fn new() -> std::io::Result { - Ok(Self { + pub fn new() -> Self { + Self { irq_status: Arc::new(AtomicU32::new(0)), - irq_evt: EventFd::new(libc::EFD_NONBLOCK)?, - }) + irq_evt: EventFd::new(libc::EFD_NONBLOCK) + .expect("Could not create EventFd for IrqTrigger"), + } } pub fn trigger_irq(&self, irq_type: IrqType) -> Result<(), std::io::Error> { @@ -427,7 +434,7 @@ pub(crate) mod tests { DummyDevice { acked_features: 0, avail_features: 0, - interrupt_trigger: IrqTrigger::new().unwrap(), + interrupt_trigger: IrqTrigger::new(), queue_evts: vec![ EventFd::new(libc::EFD_NONBLOCK).unwrap(), EventFd::new(libc::EFD_NONBLOCK).unwrap(), @@ -1031,7 +1038,7 @@ pub(crate) mod tests { #[test] fn irq_trigger() { - let irq_trigger = IrqTrigger::new().unwrap(); + let irq_trigger = IrqTrigger::new(); assert_eq!(irq_trigger.irq_status.load(Ordering::SeqCst), 0); // Check that there are no pending irqs. 
diff --git a/src/vmm/src/devices/virtio/vhost_user.rs b/src/vmm/src/devices/virtio/vhost_user.rs index d90ad16b08c..4f895e5c05e 100644 --- a/src/vmm/src/devices/virtio/vhost_user.rs +++ b/src/vmm/src/devices/virtio/vhost_user.rs @@ -901,7 +901,7 @@ pub(crate) mod tests { queue.initialize(&guest_memory).unwrap(); let event_fd = EventFd::new(0).unwrap(); - let irq_trigger = IrqTrigger::new().unwrap(); + let irq_trigger = IrqTrigger::new(); let queues = [(0, &queue, &event_fd)]; diff --git a/src/vmm/src/devices/virtio/vsock/device.rs b/src/vmm/src/devices/virtio/vsock/device.rs index e0b8477123a..fc51a61532c 100644 --- a/src/vmm/src/devices/virtio/vsock/device.rs +++ b/src/vmm/src/devices/virtio/vsock/device.rs @@ -103,7 +103,7 @@ where backend, avail_features: AVAIL_FEATURES, acked_features: 0, - irq_trigger: IrqTrigger::new().map_err(VsockError::EventFd)?, + irq_trigger: IrqTrigger::new(), activate_evt: EventFd::new(libc::EFD_NONBLOCK).map_err(VsockError::EventFd)?, device_state: DeviceState::Inactive, rx_packet: VsockPacketRx::new()?, From 271b057ea5ca932ef29104119b837175df6f2db4 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Thu, 17 Apr 2025 13:18:43 +0200 Subject: [PATCH 03/99] refactor: set VirtIO interrupt during activation The MMIO transport for VirtIO devices uses an `IrqTrigger` object as the object that models the logic for sending interrupts from the device to the guest. We create one such object for every VirtIO device when creating it. The MMIO device manager associates this object with an IRQ number and registers it with KVM. This commit changes the timing of association of an `IrqTrigger` with a VirtIO-mmio device. It only assigns such an object to the device during its activation. We do this to prepare for supporting a PCI transport for VirtIO devices. The cloud hypervisor implementation for these passes the interrupt objects used by the device during activation, so we make this change to have a uniform way to handle interrupts for both transports. Functionally, nothing changes for MMIO devices, as before activation we don't trigger any interrupts. 
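To illustrate the new flow, `activate()` now receives the interrupt object and
stores it next to the guest memory in the device's active state (a sketch based
on the diff below, where the new `ActiveState` struct bundles `mem` and
`interrupt`):

    fn activate(
        &mut self,
        mem: GuestMemoryMmap,
        interrupt: Arc<IrqTrigger>,
    ) -> Result<(), ActivateError> {
        // Queue initialization against guest memory is unchanged; the only
        // difference is that the interrupt now arrives as a parameter.
        self.device_state = DeviceState::Activated(ActiveState { mem, interrupt });
        Ok(())
    }

Devices therefore no longer own an `IrqTrigger` field; `interrupt_trigger()`
returns the trigger stored in the activated state and panics if the device has
not been activated yet.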
Signed-off-by: Babis Chalios --- src/vmm/src/builder.rs | 10 +- src/vmm/src/device_manager/mmio.rs | 21 ++- src/vmm/src/device_manager/persist.rs | 24 +++- src/vmm/src/devices/virtio/balloon/device.rs | 64 ++++++--- .../devices/virtio/balloon/event_handler.rs | 9 +- src/vmm/src/devices/virtio/balloon/mod.rs | 2 +- src/vmm/src/devices/virtio/balloon/persist.rs | 24 ++-- .../src/devices/virtio/balloon/test_utils.rs | 4 +- src/vmm/src/devices/virtio/block/device.rs | 16 ++- src/vmm/src/devices/virtio/block/persist.rs | 4 + .../devices/virtio/block/vhost_user/device.rs | 38 +++-- .../src/devices/virtio/block/virtio/device.rs | 134 +++++++++++------- .../virtio/block/virtio/event_handler.rs | 9 +- .../devices/virtio/block/virtio/persist.rs | 31 ++-- .../devices/virtio/block/virtio/test_utils.rs | 12 +- src/vmm/src/devices/virtio/device.rs | 27 +++- src/vmm/src/devices/virtio/net/device.rs | 62 ++++---- src/vmm/src/devices/virtio/net/persist.rs | 20 +-- src/vmm/src/devices/virtio/net/test_utils.rs | 21 ++- src/vmm/src/devices/virtio/persist.rs | 61 +++++--- src/vmm/src/devices/virtio/queue.rs | 2 +- src/vmm/src/devices/virtio/rng/device.rs | 32 ++--- src/vmm/src/devices/virtio/rng/persist.rs | 28 ++-- src/vmm/src/devices/virtio/test_utils.rs | 17 ++- src/vmm/src/devices/virtio/transport/mmio.rs | 82 +++++++---- src/vmm/src/devices/virtio/vsock/device.rs | 34 +++-- .../src/devices/virtio/vsock/event_handler.rs | 26 ++-- src/vmm/src/devices/virtio/vsock/persist.rs | 17 ++- .../src/devices/virtio/vsock/test_utils.rs | 10 +- 29 files changed, 546 insertions(+), 295 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 398a9f2f037..88e11ba25f9 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -46,7 +46,7 @@ use crate::devices::virtio::block::device::Block; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::net::Net; use crate::devices::virtio::rng::Entropy; -use crate::devices::virtio::transport::mmio::MmioTransport; +use crate::devices::virtio::transport::mmio::{IrqTrigger, MmioTransport}; use crate::devices::virtio::vsock::{Vsock, VsockUnixBackend}; #[cfg(feature = "gdb")] use crate::gdb; @@ -657,8 +657,14 @@ fn attach_virtio_device( ) -> Result<(), MmioError> { event_manager.add_subscriber(device.clone()); + let interrupt = Arc::new(IrqTrigger::new()); // The device mutex mustn't be locked here otherwise it will deadlock. - let device = MmioTransport::new(vmm.vm.guest_memory().clone(), device, is_vhost_user); + let device = MmioTransport::new( + vmm.vm.guest_memory().clone(), + interrupt, + device, + is_vhost_user, + ); vmm.mmio_device_manager .register_mmio_virtio_for_boot( vmm.vm.fd(), diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 8347765180b..ea9e5c6ab37 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -52,6 +52,8 @@ pub enum MmioError { InvalidDeviceType, /// {0} InternalDeviceError(String), + /// Could not create IRQ for MMIO device: {0} + CreateIrq(#[from] std::io::Error), /// Invalid MMIO IRQ configuration. 
InvalidIrqConfig, /// Failed to register IO event: {0} @@ -204,7 +206,7 @@ impl MMIODeviceManager { vm.register_ioevent(queue_evt, &io_addr, u32::try_from(i).unwrap()) .map_err(MmioError::RegisterIoEvent)?; } - vm.register_irqfd(&locked_device.interrupt_trigger().irq_evt, irq) + vm.register_irqfd(&mmio_device.interrupt.irq_evt, irq) .map_err(MmioError::RegisterIrqFd)?; } @@ -548,7 +550,8 @@ mod tests { cmdline: &mut kernel_cmdline::Cmdline, dev_id: &str, ) -> Result { - let mmio_device = MmioTransport::new(guest_mem, device, false); + let interrupt = Arc::new(IrqTrigger::new()); + let mmio_device = MmioTransport::new(guest_mem, interrupt, device, false); let device_info = self.register_mmio_virtio_for_boot( vm, resource_allocator, @@ -575,7 +578,7 @@ mod tests { dummy: u32, queues: Vec, queue_evts: [EventFd; 1], - interrupt_trigger: IrqTrigger, + interrupt_trigger: Option>, } impl DummyDevice { @@ -584,7 +587,7 @@ mod tests { dummy: 0, queues: QUEUE_SIZES.iter().map(|&s| Queue::new(s)).collect(), queue_evts: [EventFd::new(libc::EFD_NONBLOCK).expect("cannot create eventFD")], - interrupt_trigger: IrqTrigger::new(), + interrupt_trigger: None, } } } @@ -617,7 +620,9 @@ mod tests { } fn interrupt_trigger(&self) -> &IrqTrigger { - &self.interrupt_trigger + self.interrupt_trigger + .as_ref() + .expect("Device is not activated") } fn ack_features_by_page(&mut self, page: u32, value: u32) { @@ -635,7 +640,11 @@ mod tests { let _ = data; } - fn activate(&mut self, _: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + _: GuestMemoryMmap, + _: Arc, + ) -> Result<(), ActivateError> { Ok(()) } diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 7b9ccf6c7de..58c7134aa7f 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -34,7 +34,7 @@ use crate::devices::virtio::rng::Entropy; use crate::devices::virtio::rng::persist::{ EntropyConstructorArgs, EntropyPersistError as EntropyError, EntropyState, }; -use crate::devices::virtio::transport::mmio::MmioTransport; +use crate::devices::virtio::transport::mmio::{IrqTrigger, MmioTransport}; use crate::devices::virtio::vsock::persist::{ VsockConstructorArgs, VsockState, VsockUdsConstructorArgs, }; @@ -455,11 +455,13 @@ impl<'a> Persist<'a> for MMIODeviceManager { as_subscriber: Arc>, id: &String, state: &MmioTransportState, + interrupt: Arc, device_info: &MMIODeviceInfo, event_manager: &mut EventManager| -> Result<(), Self::Error> { let restore_args = MmioTransportConstructorArgs { mem: mem.clone(), + interrupt, device, is_vhost_user, }; @@ -494,9 +496,11 @@ impl<'a> Persist<'a> for MMIODeviceManager { }; if let Some(balloon_state) = &state.balloon_device { + let interrupt = Arc::new(IrqTrigger::new()); let device = Arc::new(Mutex::new(Balloon::restore( BalloonConstructorArgs { mem: mem.clone(), + interrupt: interrupt.clone(), restored_from_file: constructor_args.restored_from_file, }, &balloon_state.device_state, @@ -512,14 +516,19 @@ impl<'a> Persist<'a> for MMIODeviceManager { device, &balloon_state.device_id, &balloon_state.transport_state, + interrupt, &balloon_state.device_info, constructor_args.event_manager, )?; } for block_state in &state.block_devices { + let interrupt = Arc::new(IrqTrigger::new()); let device = Arc::new(Mutex::new(Block::restore( - BlockConstructorArgs { mem: mem.clone() }, + BlockConstructorArgs { + mem: mem.clone(), + interrupt: interrupt.clone(), + }, &block_state.device_state, )?)); @@ -533,6 +542,7 @@ impl<'a> 
Persist<'a> for MMIODeviceManager { device, &block_state.device_id, &block_state.transport_state, + interrupt, &block_state.device_info, constructor_args.event_manager, )?; @@ -548,9 +558,11 @@ impl<'a> Persist<'a> for MMIODeviceManager { } for net_state in &state.net_devices { + let interrupt = Arc::new(IrqTrigger::new()); let device = Arc::new(Mutex::new(Net::restore( NetConstructorArgs { mem: mem.clone(), + interrupt: interrupt.clone(), mmds: constructor_args .vm_resources .mmds @@ -571,6 +583,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { device, &net_state.device_id, &net_state.transport_state, + interrupt, &net_state.device_info, constructor_args.event_manager, )?; @@ -581,9 +594,11 @@ impl<'a> Persist<'a> for MMIODeviceManager { cid: vsock_state.device_state.frontend.cid, }; let backend = VsockUnixBackend::restore(ctor_args, &vsock_state.device_state.backend)?; + let interrupt = Arc::new(IrqTrigger::new()); let device = Arc::new(Mutex::new(Vsock::restore( VsockConstructorArgs { mem: mem.clone(), + interrupt: interrupt.clone(), backend, }, &vsock_state.device_state.frontend, @@ -599,13 +614,15 @@ impl<'a> Persist<'a> for MMIODeviceManager { device, &vsock_state.device_id, &vsock_state.transport_state, + interrupt, &vsock_state.device_info, constructor_args.event_manager, )?; } if let Some(entropy_state) = &state.entropy_device { - let ctor_args = EntropyConstructorArgs::new(mem.clone()); + let interrupt = Arc::new(IrqTrigger::new()); + let ctor_args = EntropyConstructorArgs::new(mem.clone(), interrupt.clone()); let device = Arc::new(Mutex::new(Entropy::restore( ctor_args, @@ -622,6 +639,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { device, &entropy_state.device_id, &entropy_state.transport_state, + interrupt, &entropy_state.device_info, constructor_args.event_manager, )?; diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs index 3a13bb508ec..35824de1e80 100644 --- a/src/vmm/src/devices/virtio/balloon/device.rs +++ b/src/vmm/src/devices/virtio/balloon/device.rs @@ -1,6 +1,7 @@ // Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 +use std::sync::Arc; use std::time::Duration; use log::error; @@ -23,6 +24,7 @@ use super::{ VIRTIO_BALLOON_S_SWAP_OUT, }; use crate::devices::virtio::balloon::BalloonError; +use crate::devices::virtio::device::ActiveState; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::queue::InvalidAvailIdx; use crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; @@ -162,7 +164,6 @@ pub struct Balloon { pub(crate) queues: Vec, pub(crate) queue_evts: [EventFd; BALLOON_NUM_QUEUES], pub(crate) device_state: DeviceState, - pub(crate) irq_trigger: IrqTrigger, // Implementation specific fields. pub(crate) restored_from_file: bool, @@ -220,7 +221,6 @@ impl Balloon { }, queue_evts, queues, - irq_trigger: IrqTrigger::new(), device_state: DeviceState::Inactive, activate_evt: EventFd::new(libc::EFD_NONBLOCK).map_err(BalloonError::EventFd)?, restored_from_file, @@ -260,7 +260,7 @@ impl Balloon { pub(crate) fn process_inflate(&mut self) -> Result<(), BalloonError> { // This is safe since we checked in the event handler that the device is activated. 
- let mem = self.device_state.mem().unwrap(); + let mem = &self.device_state.active_state().unwrap().mem; METRICS.inflate_count.inc(); let queue = &mut self.queues[INFLATE_INDEX]; @@ -369,7 +369,7 @@ impl Balloon { pub(crate) fn process_stats_queue(&mut self) -> Result<(), BalloonError> { // This is safe since we checked in the event handler that the device is activated. - let mem = self.device_state.mem().unwrap(); + let mem = &self.device_state.active_state().unwrap().mem; METRICS.stats_updates_count.inc(); while let Some(head) = self.queues[STATS_INDEX].pop()? { @@ -404,10 +404,12 @@ impl Balloon { } pub(crate) fn signal_used_queue(&self) -> Result<(), BalloonError> { - self.irq_trigger.trigger_irq(IrqType::Vring).map_err(|err| { - METRICS.event_fails.inc(); - BalloonError::InterruptError(err) - }) + self.interrupt_trigger() + .trigger_irq(IrqType::Vring) + .map_err(|err| { + METRICS.event_fails.inc(); + BalloonError::InterruptError(err) + }) } /// Process device virtio queue(s). @@ -444,7 +446,7 @@ impl Balloon { pub fn update_size(&mut self, amount_mib: u32) -> Result<(), BalloonError> { if self.is_activated() { self.config_space.num_pages = mib_to_pages(amount_mib)?; - self.irq_trigger + self.interrupt_trigger() .trigger_irq(IrqType::Config) .map_err(BalloonError::InterruptError) } else { @@ -557,7 +559,11 @@ impl VirtioDevice for Balloon { } fn interrupt_trigger(&self) -> &IrqTrigger { - &self.irq_trigger + &self + .device_state + .active_state() + .expect("Device is not activated") + .interrupt } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -584,13 +590,17 @@ impl VirtioDevice for Balloon { dst.copy_from_slice(data); } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc, + ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) .map_err(ActivateError::QueueMemoryError)?; } - self.device_state = DeviceState::Activated(mem); + self.device_state = DeviceState::Activated(ActiveState { mem, interrupt }); if self.activate_evt.write(1).is_err() { METRICS.activate_fails.inc(); self.device_state = DeviceState::Inactive; @@ -619,7 +629,7 @@ pub(crate) mod tests { check_request_completion, invoke_handler_for_queue_event, set_request, }; use crate::devices::virtio::queue::{VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; - use crate::devices::virtio::test_utils::{VirtQueue, default_mem}; + use crate::devices::virtio::test_utils::{VirtQueue, default_interrupt, default_mem}; use crate::test_utils::single_region_mem; use crate::vstate::memory::GuestAddress; @@ -796,11 +806,12 @@ pub(crate) mod tests { fn test_invalid_request() { let mut balloon = Balloon::new(0, true, 0, false).unwrap(); let mem = default_mem(); + let interrupt = default_interrupt(); // Only initialize the inflate queue to demonstrate invalid request handling. let infq = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, infq.create_queue()); balloon.set_queue(DEFLATE_INDEX, infq.create_queue()); - balloon.activate(mem.clone()).unwrap(); + balloon.activate(mem.clone(), interrupt).unwrap(); // Fill the second page with non-zero bytes. 
for i in 0..0x1000 { @@ -856,10 +867,11 @@ pub(crate) mod tests { fn test_inflate() { let mut balloon = Balloon::new(0, true, 0, false).unwrap(); let mem = default_mem(); + let interrupt = default_interrupt(); let infq = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, infq.create_queue()); balloon.set_queue(DEFLATE_INDEX, infq.create_queue()); - balloon.activate(mem.clone()).unwrap(); + balloon.activate(mem.clone(), interrupt).unwrap(); // Fill the third page with non-zero bytes. for i in 0..0x1000 { @@ -927,10 +939,11 @@ pub(crate) mod tests { fn test_deflate() { let mut balloon = Balloon::new(0, true, 0, false).unwrap(); let mem = default_mem(); + let interrupt = default_interrupt(); let defq = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, defq.create_queue()); balloon.set_queue(DEFLATE_INDEX, defq.create_queue()); - balloon.activate(mem.clone()).unwrap(); + balloon.activate(mem.clone(), interrupt).unwrap(); let page_addr = 0x10; @@ -976,11 +989,12 @@ pub(crate) mod tests { fn test_stats() { let mut balloon = Balloon::new(0, true, 1, false).unwrap(); let mem = default_mem(); + let interrupt = default_interrupt(); let statsq = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, statsq.create_queue()); balloon.set_queue(DEFLATE_INDEX, statsq.create_queue()); balloon.set_queue(STATS_INDEX, statsq.create_queue()); - balloon.activate(mem.clone()).unwrap(); + balloon.activate(mem.clone(), interrupt).unwrap(); let page_addr = 0x100; @@ -1056,7 +1070,7 @@ pub(crate) mod tests { assert!(balloon.stats_desc_index.is_some()); balloon.process_stats_timer_event().unwrap(); assert!(balloon.stats_desc_index.is_none()); - assert!(balloon.irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(balloon.interrupt_trigger().has_pending_irq(IrqType::Vring)); }); } } @@ -1065,13 +1079,14 @@ pub(crate) mod tests { fn test_process_balloon_queues() { let mut balloon = Balloon::new(0x10, true, 0, false).unwrap(); let mem = default_mem(); + let interrupt = default_interrupt(); let infq = VirtQueue::new(GuestAddress(0), &mem, 16); let defq = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, infq.create_queue()); balloon.set_queue(DEFLATE_INDEX, defq.create_queue()); - balloon.activate(mem).unwrap(); + balloon.activate(mem, interrupt).unwrap(); balloon.process_virtio_queues().unwrap(); } @@ -1082,7 +1097,8 @@ pub(crate) mod tests { let q = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, q.create_queue()); balloon.set_queue(DEFLATE_INDEX, q.create_queue()); - balloon.activate(mem).unwrap(); + let interrupt = default_interrupt(); + balloon.activate(mem, interrupt).unwrap(); assert_eq!( format!("{:?}", balloon.update_stats_polling_interval(1)), "Err(StatisticsStateChange)" @@ -1095,7 +1111,8 @@ pub(crate) mod tests { balloon.set_queue(INFLATE_INDEX, q.create_queue()); balloon.set_queue(DEFLATE_INDEX, q.create_queue()); balloon.set_queue(STATS_INDEX, q.create_queue()); - balloon.activate(mem).unwrap(); + let interrupt = default_interrupt(); + balloon.activate(mem, interrupt).unwrap(); assert_eq!( format!("{:?}", balloon.update_stats_polling_interval(0)), "Err(StatisticsStateChange)" @@ -1115,7 +1132,10 @@ pub(crate) mod tests { fn test_num_pages() { let mut balloon = Balloon::new(0, true, 0, false).unwrap(); // Switch the state to active. 
- balloon.device_state = DeviceState::Activated(single_region_mem(0x1)); + balloon.device_state = DeviceState::Activated(ActiveState { + mem: single_region_mem(0x1), + interrupt: default_interrupt(), + }); assert_eq!(balloon.num_pages(), 0); assert_eq!(balloon.actual_pages(), 0); diff --git a/src/vmm/src/devices/virtio/balloon/event_handler.rs b/src/vmm/src/devices/virtio/balloon/event_handler.rs index 4e311edc045..3922b4b8385 100644 --- a/src/vmm/src/devices/virtio/balloon/event_handler.rs +++ b/src/vmm/src/devices/virtio/balloon/event_handler.rs @@ -136,7 +136,7 @@ pub mod tests { use super::*; use crate::devices::virtio::balloon::test_utils::set_request; - use crate::devices::virtio::test_utils::{VirtQueue, default_mem}; + use crate::devices::virtio::test_utils::{VirtQueue, default_interrupt, default_mem}; use crate::vstate::memory::GuestAddress; #[test] @@ -144,6 +144,7 @@ pub mod tests { let mut event_manager = EventManager::new().unwrap(); let mut balloon = Balloon::new(0, true, 10, false).unwrap(); let mem = default_mem(); + let interrupt = default_interrupt(); let infq = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, infq.create_queue()); balloon.set_queue(DEFLATE_INDEX, infq.create_queue()); @@ -179,7 +180,11 @@ pub mod tests { } // Now activate the device. - balloon.lock().unwrap().activate(mem.clone()).unwrap(); + balloon + .lock() + .unwrap() + .activate(mem.clone(), interrupt) + .unwrap(); // Process the activate event. let ev_count = event_manager.run_with_timeout(50).unwrap(); assert_eq!(ev_count, 1); diff --git a/src/vmm/src/devices/virtio/balloon/mod.rs b/src/vmm/src/devices/virtio/balloon/mod.rs index 5af1e17288a..3f3e9346545 100644 --- a/src/vmm/src/devices/virtio/balloon/mod.rs +++ b/src/vmm/src/devices/virtio/balloon/mod.rs @@ -81,7 +81,7 @@ pub enum BalloonError { MalformedPayload, /// Error restoring the balloon device queues. QueueRestoreError, - /// Received stats querry when stats are disabled. + /// Received stats query when stats are disabled. StatisticsDisabled, /// Statistics cannot be enabled/disabled after activation. StatisticsStateChange, diff --git a/src/vmm/src/devices/virtio/balloon/persist.rs b/src/vmm/src/devices/virtio/balloon/persist.rs index 004fa27f8ca..397dd8aeb3e 100644 --- a/src/vmm/src/devices/virtio/balloon/persist.rs +++ b/src/vmm/src/devices/virtio/balloon/persist.rs @@ -4,7 +4,6 @@ //! Defines the structures needed for saving/restoring balloon devices. use std::sync::Arc; -use std::sync::atomic::AtomicU32; use std::time::Duration; use serde::{Deserialize, Serialize}; @@ -13,9 +12,10 @@ use timerfd::{SetTimeFlags, TimerState}; use super::*; use crate::devices::virtio::TYPE_BALLOON; use crate::devices::virtio::balloon::device::{BalloonStats, ConfigSpace}; -use crate::devices::virtio::device::DeviceState; +use crate::devices::virtio::device::{ActiveState, DeviceState}; use crate::devices::virtio::persist::VirtioDeviceState; use crate::devices::virtio::queue::FIRECRACKER_MAX_QUEUE_SIZE; +use crate::devices::virtio::transport::mmio::IrqTrigger; use crate::snapshot::Persist; use crate::vstate::memory::GuestMemoryMmap; @@ -95,6 +95,8 @@ pub struct BalloonState { pub struct BalloonConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, + /// Interrupt used from the device. 
+ pub interrupt: Arc, pub restored_from_file: bool, } @@ -144,8 +146,6 @@ impl Persist<'_> for Balloon { FIRECRACKER_MAX_QUEUE_SIZE, ) .map_err(|_| Self::Error::QueueRestoreError)?; - balloon.irq_trigger.irq_status = - Arc::new(AtomicU32::new(state.virtio_state.interrupt_status)); balloon.avail_features = state.virtio_state.avail_features; balloon.acked_features = state.virtio_state.acked_features; balloon.latest_stats = state.latest_stats.create_stats(); @@ -155,7 +155,10 @@ impl Persist<'_> for Balloon { }; if state.virtio_state.activated { - balloon.device_state = DeviceState::Activated(constructor_args.mem); + balloon.device_state = DeviceState::Activated(ActiveState { + mem: constructor_args.mem, + interrupt: constructor_args.interrupt, + }); if balloon.stats_enabled() { // Restore the stats descriptor. @@ -178,12 +181,11 @@ impl Persist<'_> for Balloon { #[cfg(test)] mod tests { - use std::sync::atomic::Ordering; use super::*; use crate::devices::virtio::TYPE_BALLOON; use crate::devices::virtio::device::VirtioDevice; - use crate::devices::virtio::test_utils::default_mem; + use crate::devices::virtio::test_utils::{default_interrupt, default_mem}; use crate::snapshot::Snapshot; #[test] @@ -200,6 +202,7 @@ mod tests { let restored_balloon = Balloon::restore( BalloonConstructorArgs { mem: guest_mem, + interrupt: default_interrupt(), restored_from_file: true, }, &Snapshot::deserialize(&mut mem.as_slice()).unwrap(), @@ -213,11 +216,8 @@ mod tests { assert_eq!(restored_balloon.avail_features, balloon.avail_features); assert_eq!(restored_balloon.config_space, balloon.config_space); assert_eq!(restored_balloon.queues(), balloon.queues()); - assert_eq!( - restored_balloon.interrupt_status().load(Ordering::Relaxed), - balloon.interrupt_status().load(Ordering::Relaxed) - ); - assert_eq!(restored_balloon.is_activated(), balloon.is_activated()); + assert!(!restored_balloon.is_activated()); + assert!(!balloon.is_activated()); assert_eq!( restored_balloon.stats_polling_interval_s, diff --git a/src/vmm/src/devices/virtio/balloon/test_utils.rs b/src/vmm/src/devices/virtio/balloon/test_utils.rs index 69b0b4f92a0..e588abaedee 100644 --- a/src/vmm/src/devices/virtio/balloon/test_utils.rs +++ b/src/vmm/src/devices/virtio/balloon/test_utils.rs @@ -10,6 +10,7 @@ use crate::devices::virtio::{balloon::BALLOON_NUM_QUEUES, balloon::Balloon}; #[cfg(test)] pub fn invoke_handler_for_queue_event(b: &mut Balloon, queue_index: usize) { use crate::devices::virtio::balloon::{DEFLATE_INDEX, INFLATE_INDEX, STATS_INDEX}; + use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::transport::mmio::IrqType; assert!(queue_index < BALLOON_NUM_QUEUES); @@ -23,7 +24,8 @@ pub fn invoke_handler_for_queue_event(b: &mut Balloon, queue_index: usize) { _ => unreachable!(), }; // Validate the queue operation finished successfully. - assert!(b.irq_trigger.has_pending_irq(IrqType::Vring)); + let interrupt = b.interrupt_trigger(); + assert!(interrupt.has_pending_irq(IrqType::Vring)); } pub fn set_request(queue: &VirtQueue, idx: u16, addr: u64, len: u32, flags: u16) { diff --git a/src/vmm/src/devices/virtio/block/device.rs b/src/vmm/src/devices/virtio/block/device.rs index 4f4676a24a8..5a491c537c5 100644 --- a/src/vmm/src/devices/virtio/block/device.rs +++ b/src/vmm/src/devices/virtio/block/device.rs @@ -1,6 +1,8 @@ // Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 +use std::sync::Arc; + use event_manager::{EventOps, Events, MutEventSubscriber}; use vmm_sys_util::eventfd::EventFd; @@ -176,8 +178,8 @@ impl VirtioDevice for Block { fn interrupt_trigger(&self) -> &IrqTrigger { match self { - Self::Virtio(b) => &b.irq_trigger, - Self::VhostUser(b) => &b.irq_trigger, + Self::Virtio(b) => b.interrupt_trigger(), + Self::VhostUser(b) => b.interrupt_trigger(), } } @@ -195,10 +197,14 @@ impl VirtioDevice for Block { } } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc, + ) -> Result<(), ActivateError> { match self { - Self::Virtio(b) => b.activate(mem), - Self::VhostUser(b) => b.activate(mem), + Self::Virtio(b) => b.activate(mem, interrupt), + Self::VhostUser(b) => b.activate(mem, interrupt), } } diff --git a/src/vmm/src/devices/virtio/block/persist.rs b/src/vmm/src/devices/virtio/block/persist.rs index 2d83c416d9f..e7ae1768cca 100644 --- a/src/vmm/src/devices/virtio/block/persist.rs +++ b/src/vmm/src/devices/virtio/block/persist.rs @@ -1,10 +1,13 @@ // Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 +use std::sync::Arc; + use serde::{Deserialize, Serialize}; use super::vhost_user::persist::VhostUserBlockState; use super::virtio::persist::VirtioBlockState; +use crate::devices::virtio::transport::mmio::IrqTrigger; use crate::vstate::memory::GuestMemoryMmap; /// Block device state. @@ -18,4 +21,5 @@ pub enum BlockState { #[derive(Debug)] pub struct BlockConstructorArgs { pub mem: GuestMemoryMmap, + pub interrupt: Arc, } diff --git a/src/vmm/src/devices/virtio/block/vhost_user/device.rs b/src/vmm/src/devices/virtio/block/vhost_user/device.rs index 264db2fa7f0..22429996d5f 100644 --- a/src/vmm/src/devices/virtio/block/vhost_user/device.rs +++ b/src/vmm/src/devices/virtio/block/vhost_user/device.rs @@ -14,6 +14,7 @@ use vmm_sys_util::eventfd::EventFd; use super::{NUM_QUEUES, QUEUE_SIZE, VhostUserBlockError}; use crate::devices::virtio::block::CacheType; +use crate::devices::virtio::device::ActiveState; use crate::devices::virtio::device::{DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_blk::{VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO}; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; @@ -118,7 +119,6 @@ pub struct VhostUserBlockImpl { pub queues: Vec, pub queue_evts: [EventFd; u64_to_usize(NUM_QUEUES)], pub device_state: DeviceState, - pub irq_trigger: IrqTrigger, // Implementation specific fields. pub id: String, @@ -144,7 +144,6 @@ impl std::fmt::Debug for VhostUserBlockImpl { .field("queues", &self.queues) .field("queue_evts", &self.queue_evts) .field("device_state", &self.device_state) - .field("irq_trigger", &self.irq_trigger) .field("id", &self.id) .field("partuuid", &self.partuuid) .field("cache_type", &self.cache_type) @@ -204,7 +203,6 @@ impl VhostUserBlockImpl { let queue_evts = [EventFd::new(libc::EFD_NONBLOCK).map_err(VhostUserBlockError::EventFd)?; u64_to_usize(NUM_QUEUES)]; let device_state = DeviceState::Inactive; - let irq_trigger = IrqTrigger::new(); // We negotiated features with backend. Now these acked_features // are available for guest driver to choose from. 
@@ -226,7 +224,6 @@ impl VhostUserBlockImpl { queues, queue_evts, device_state, - irq_trigger, id: config.drive_id, partuuid: config.partuuid, @@ -257,6 +254,12 @@ impl VhostUserBlockImpl { pub fn config_update(&mut self) -> Result<(), VhostUserBlockError> { let start_time = get_time_us(ClockType::Monotonic); + let interrupt = self + .device_state + .active_state() + .expect("Device is not initialized") + .interrupt + .clone(); // This buffer is used for config size check in vhost crate. let buffer = [0u8; BLOCK_CONFIG_SPACE_SIZE as usize]; @@ -271,7 +274,7 @@ impl VhostUserBlockImpl { ) .map_err(VhostUserBlockError::Vhost)?; self.config_space = new_config_space; - self.irq_trigger + interrupt .trigger_irq(IrqType::Config) .map_err(VhostUserBlockError::IrqTrigger)?; @@ -312,7 +315,11 @@ impl VirtioDevice for VhostUserBlock } fn interrupt_trigger(&self) -> &IrqTrigger { - &self.irq_trigger + &self + .device_state + .active_state() + .expect("Device is not initialized") + .interrupt } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -331,7 +338,11 @@ impl VirtioDevice for VhostUserBlock // Other block config fields are immutable. } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc, + ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) .map_err(ActivateError::QueueMemoryError)?; @@ -346,14 +357,14 @@ impl VirtioDevice for VhostUserBlock self.vu_handle.setup_backend( &mem, &[(0, &self.queues[0], &self.queue_evts[0])], - &self.irq_trigger, + &interrupt, ) }) .map_err(|err| { self.metrics.activate_fails.inc(); ActivateError::VhostUser(err) })?; - self.device_state = DeviceState::Activated(mem); + self.device_state = DeviceState::Activated(ActiveState { mem, interrupt }); let delta_us = get_time_us(ClockType::Monotonic) - start_time; self.metrics.activate_time_us.store(delta_us); Ok(()) @@ -376,7 +387,7 @@ mod tests { use super::*; use crate::devices::virtio::block::virtio::device::FileEngineType; - use crate::devices::virtio::test_utils::VirtQueue; + use crate::devices::virtio::test_utils::{VirtQueue, default_interrupt, default_mem}; use crate::devices::virtio::transport::mmio::VIRTIO_MMIO_INT_CONFIG; use crate::devices::virtio::vhost_user::tests::create_mem; use crate::test_utils::create_tmp_socket; @@ -653,6 +664,10 @@ mod tests { assert_eq!(vhost_block.config_space, vec![0x69, 0x69, 0x69]); // Testing [`config_update`] + vhost_block.device_state = DeviceState::Activated(ActiveState { + mem: default_mem(), + interrupt: default_interrupt(), + }); vhost_block.config_space = vec![]; vhost_block.config_update().unwrap(); assert_eq!(vhost_block.config_space, vec![0x69, 0x69, 0x69]); @@ -784,9 +799,10 @@ mod tests { let guest_memory = create_mem(file, ®ions); let q = VirtQueue::new(GuestAddress(0), &guest_memory, 16); vhost_block.queues[0] = q.create_queue(); + let interrupt = default_interrupt(); // During actiavion of the device features, memory and queues should be set and activated. 
- vhost_block.activate(guest_memory).unwrap(); + vhost_block.activate(guest_memory, interrupt).unwrap(); assert!(unsafe { *vhost_block.vu_handle.vu.features_are_set.get() }); assert!(unsafe { *vhost_block.vu_handle.vu.memory_is_set.get() }); assert!(unsafe { *vhost_block.vu_handle.vu.vring_enabled.get() }); diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index e89443e5bd9..413410f2af6 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -23,7 +23,7 @@ use super::request::*; use super::{BLOCK_QUEUE_SIZES, SECTOR_SHIFT, SECTOR_SIZE, VirtioBlockError, io as block_io}; use crate::devices::virtio::block::CacheType; use crate::devices::virtio::block::virtio::metrics::{BlockDeviceMetrics, BlockMetricsPerDevice}; -use crate::devices::virtio::device::{DeviceState, VirtioDevice}; +use crate::devices::virtio::device::{ActiveState, DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_blk::{ VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO, VIRTIO_BLK_ID_BYTES, }; @@ -250,7 +250,6 @@ pub struct VirtioBlock { pub queues: Vec, pub queue_evts: [EventFd; 1], pub device_state: DeviceState, - pub irq_trigger: IrqTrigger, // Implementation specific fields. pub id: String, @@ -323,7 +322,6 @@ impl VirtioBlock { queues, queue_evts, device_state: DeviceState::Inactive, - irq_trigger: IrqTrigger::new(), id: config.drive_id.clone(), partuuid: config.partuuid, @@ -388,34 +386,40 @@ impl VirtioBlock { /// Device specific function for peaking inside a queue and processing descriptors. pub fn process_queue(&mut self, queue_index: usize) -> Result<(), InvalidAvailIdx> { // This is safe since we checked in the event handler that the device is activated. - let mem = self.device_state.mem().unwrap(); + let active_state = self.device_state.active_state().unwrap(); let queue = &mut self.queues[queue_index]; let mut used_any = false; while let Some(head) = queue.pop_or_enable_notification()? { self.metrics.remaining_reqs_count.add(queue.len().into()); - let processing_result = match Request::parse(&head, mem, self.disk.nsectors) { - Ok(request) => { - if request.rate_limit(&mut self.rate_limiter) { - // Stop processing the queue and return this descriptor chain to the - // avail ring, for later processing. - queue.undo_pop(); - self.metrics.rate_limiter_throttled_events.inc(); - break; + let processing_result = + match Request::parse(&head, &active_state.mem, self.disk.nsectors) { + Ok(request) => { + if request.rate_limit(&mut self.rate_limiter) { + // Stop processing the queue and return this descriptor chain to the + // avail ring, for later processing. 
+ queue.undo_pop(); + self.metrics.rate_limiter_throttled_events.inc(); + break; + } + + request.process( + &mut self.disk, + head.index, + &active_state.mem, + &self.metrics, + ) } - - request.process(&mut self.disk, head.index, mem, &self.metrics) - } - Err(err) => { - error!("Failed to parse available descriptor chain: {:?}", err); - self.metrics.execute_fails.inc(); - ProcessingResult::Executed(FinishedRequest { - num_bytes_to_mem: 0, - desc_idx: head.index, - }) - } - }; + Err(err) => { + error!("Failed to parse available descriptor chain: {:?}", err); + self.metrics.execute_fails.inc(); + ProcessingResult::Executed(FinishedRequest { + num_bytes_to_mem: 0, + desc_idx: head.index, + }) + } + }; match processing_result { ProcessingResult::Submitted => {} @@ -440,7 +444,8 @@ impl VirtioBlock { queue.advance_used_ring_idx(); if used_any && queue.prepare_kick() { - self.irq_trigger + active_state + .interrupt .trigger_irq(IrqType::Vring) .unwrap_or_else(|_| { self.metrics.event_fails.inc(); @@ -464,11 +469,11 @@ impl VirtioBlock { let engine = unwrap_async_file_engine_or_return!(&mut self.disk.file_engine); // This is safe since we checked in the event handler that the device is activated. - let mem = self.device_state.mem().unwrap(); + let active_state = self.device_state.active_state().unwrap(); let queue = &mut self.queues[0]; loop { - match engine.pop(mem) { + match engine.pop(&active_state.mem) { Err(error) => { error!("Failed to read completed io_uring entry: {:?}", error); break; @@ -487,7 +492,7 @@ impl VirtioBlock { ))), ), }; - let finished = pending.finish(mem, res, &self.metrics); + let finished = pending.finish(&active_state.mem, res, &self.metrics); queue .add_used(finished.desc_idx, finished.num_bytes_to_mem) .unwrap_or_else(|err| { @@ -502,7 +507,8 @@ impl VirtioBlock { queue.advance_used_ring_idx(); if queue.prepare_kick() { - self.irq_trigger + active_state + .interrupt .trigger_irq(IrqType::Vring) .unwrap_or_else(|_| { self.metrics.event_fails.inc(); @@ -531,7 +537,9 @@ impl VirtioBlock { self.config_space.capacity = self.disk.nsectors.to_le(); // virtio_block_config_space(); // Kick the driver to pick up the changes. 
- self.irq_trigger.trigger_irq(IrqType::Config).unwrap(); + self.interrupt_trigger() + .trigger_irq(IrqType::Config) + .unwrap(); self.metrics.update_count.inc(); Ok(()) @@ -599,7 +607,11 @@ impl VirtioDevice for VirtioBlock { } fn interrupt_trigger(&self) -> &IrqTrigger { - &self.irq_trigger + &self + .device_state + .active_state() + .expect("Device is not initialized") + .interrupt } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -628,7 +640,11 @@ impl VirtioDevice for VirtioBlock { dst.copy_from_slice(data); } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc, + ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) .map_err(ActivateError::QueueMemoryError)?; @@ -645,7 +661,7 @@ impl VirtioDevice for VirtioBlock { self.metrics.activate_fails.inc(); return Err(ActivateError::EventFd); } - self.device_state = DeviceState::Activated(mem); + self.device_state = DeviceState::Activated(ActiveState { mem, interrupt }); Ok(()) } @@ -688,7 +704,7 @@ mod tests { simulate_queue_event, }; use crate::devices::virtio::queue::{VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; - use crate::devices::virtio::test_utils::{VirtQueue, default_mem}; + use crate::devices::virtio::test_utils::{VirtQueue, default_interrupt, default_mem}; use crate::rate_limiter::TokenType; use crate::vstate::memory::{Address, Bytes, GuestAddress}; @@ -863,9 +879,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -891,9 +908,10 @@ mod tests { let mut block = default_block(engine); // Default mem size is 0x10000 let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -954,9 +972,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1005,9 +1024,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1037,9 +1057,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let 
vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); vq.dtable[1].set(0xf000, 0x1000, VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, 2); @@ -1073,9 +1094,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1120,9 +1142,10 @@ mod tests { // Default mem size is 0x10000 let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1359,9 +1382,10 @@ mod tests { { // Default mem size is 0x10000 let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); vq.dtable[1].set(0xff00, 0x1000, VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, 2); @@ -1400,9 +1424,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1446,9 +1471,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1570,9 +1596,10 @@ mod tests { let mut block = default_block(FileEngineType::Async); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, IO_URING_NUM_ENTRIES * 4); block.queues[0] = vq.create_queue(); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); // Run scenario that doesn't trigger FullSq BlockError: Add sq_size flush requests. add_flush_requests_batch(&mut block, &vq, IO_URING_NUM_ENTRIES); @@ -1604,9 +1631,10 @@ mod tests { let mut block = default_block(FileEngineType::Async); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, IO_URING_NUM_ENTRIES * 4); block.queues[0] = vq.create_queue(); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); // Run scenario that triggers FullCqError. Push 2 * IO_URING_NUM_ENTRIES and wait for // completion. 
Then try to push another entry. @@ -1634,9 +1662,10 @@ mod tests { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); block.queues[0] = vq.create_queue(); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); // Add a batch of flush requests. add_flush_requests_batch(&mut block, &vq, 5); @@ -1653,9 +1682,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1722,9 +1752,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1804,6 +1835,11 @@ mod tests { fn test_update_disk_image() { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); + let mem = default_mem(); + let interrupt = default_interrupt(); + let vq = VirtQueue::new(GuestAddress(0), &mem, 16); + set_queue(&mut block, 0, vq.create_queue()); + block.activate(mem, interrupt).unwrap(); let f = TempFile::new().unwrap(); let path = f.as_path(); let mdata = metadata(path).unwrap(); diff --git a/src/vmm/src/devices/virtio/block/virtio/event_handler.rs b/src/vmm/src/devices/virtio/block/virtio/event_handler.rs index db69e23d7f0..03c09a01972 100644 --- a/src/vmm/src/devices/virtio/block/virtio/event_handler.rs +++ b/src/vmm/src/devices/virtio/block/virtio/event_handler.rs @@ -124,7 +124,7 @@ mod tests { }; use crate::devices::virtio::block::virtio::{VIRTIO_BLK_S_OK, VIRTIO_BLK_T_OUT}; use crate::devices::virtio::queue::VIRTQ_DESC_F_NEXT; - use crate::devices::virtio::test_utils::{VirtQueue, default_mem}; + use crate::devices::virtio::test_utils::{VirtQueue, default_interrupt, default_mem}; use crate::vstate::memory::{Bytes, GuestAddress}; #[test] @@ -132,6 +132,7 @@ mod tests { let mut event_manager = EventManager::new().unwrap(); let mut block = default_block(FileEngineType::default()); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); read_blk_req_descriptors(&vq); @@ -162,7 +163,11 @@ mod tests { assert_eq!(ev_count, 0); // Now activate the device. - block.lock().unwrap().activate(mem.clone()).unwrap(); + block + .lock() + .unwrap() + .activate(mem.clone(), interrupt) + .unwrap(); // Process the activate event. let ev_count = event_manager.run_with_timeout(50).unwrap(); assert_eq!(ev_count, 1); diff --git a/src/vmm/src/devices/virtio/block/virtio/persist.rs b/src/vmm/src/devices/virtio/block/virtio/persist.rs index 33a33968e53..57e4a11b9c1 100644 --- a/src/vmm/src/devices/virtio/block/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/block/virtio/persist.rs @@ -3,9 +3,6 @@ //! 
Defines the structures needed for saving/restoring block devices. -use std::sync::Arc; -use std::sync::atomic::AtomicU32; - use device::ConfigSpace; use serde::{Deserialize, Serialize}; use vmm_sys_util::eventfd::EventFd; @@ -16,10 +13,9 @@ use crate::devices::virtio::TYPE_BLOCK; use crate::devices::virtio::block::persist::BlockConstructorArgs; use crate::devices::virtio::block::virtio::device::FileEngineType; use crate::devices::virtio::block::virtio::metrics::BlockMetricsPerDevice; -use crate::devices::virtio::device::DeviceState; +use crate::devices::virtio::device::{ActiveState, DeviceState}; use crate::devices::virtio::generated::virtio_blk::VIRTIO_BLK_F_RO; use crate::devices::virtio::persist::VirtioDeviceState; -use crate::devices::virtio::transport::mmio::IrqTrigger; use crate::rate_limiter::RateLimiter; use crate::rate_limiter::persist::RateLimiterState; use crate::snapshot::Persist; @@ -112,14 +108,14 @@ impl Persist<'_> for VirtioBlock { ) .map_err(VirtioBlockError::Persist)?; - let mut irq_trigger = IrqTrigger::new(); - irq_trigger.irq_status = Arc::new(AtomicU32::new(state.virtio_state.interrupt_status)); - let avail_features = state.virtio_state.avail_features; let acked_features = state.virtio_state.acked_features; let device_state = if state.virtio_state.activated { - DeviceState::Activated(constructor_args.mem) + DeviceState::Activated(ActiveState { + mem: constructor_args.mem, + interrupt: constructor_args.interrupt, + }) } else { DeviceState::Inactive }; @@ -137,7 +133,6 @@ impl Persist<'_> for VirtioBlock { queues, queue_evts, device_state, - irq_trigger, id: state.id.clone(), partuuid: state.partuuid.clone(), @@ -155,14 +150,12 @@ impl Persist<'_> for VirtioBlock { #[cfg(test)] mod tests { - use std::sync::atomic::Ordering; - use vmm_sys_util::tempfile::TempFile; use super::*; use crate::devices::virtio::block::virtio::device::VirtioBlockConfig; use crate::devices::virtio::device::VirtioDevice; - use crate::devices::virtio::test_utils::default_mem; + use crate::devices::virtio::test_utils::{default_interrupt, default_mem}; use crate::snapshot::Snapshot; #[test] @@ -234,7 +227,10 @@ mod tests { // Restore the block device. let restored_block = VirtioBlock::restore( - BlockConstructorArgs { mem: guest_mem }, + BlockConstructorArgs { + mem: guest_mem, + interrupt: default_interrupt(), + }, &Snapshot::deserialize(&mut mem.as_slice()).unwrap(), ) .unwrap(); @@ -244,11 +240,8 @@ mod tests { assert_eq!(restored_block.avail_features(), block.avail_features()); assert_eq!(restored_block.acked_features(), block.acked_features()); assert_eq!(restored_block.queues(), block.queues()); - assert_eq!( - restored_block.interrupt_status().load(Ordering::Relaxed), - block.interrupt_status().load(Ordering::Relaxed) - ); - assert_eq!(restored_block.is_activated(), block.is_activated()); + assert!(!block.is_activated()); + assert!(!restored_block.is_activated()); // Test that block specific fields are the same. 
assert_eq!(restored_block.disk.file_path, block.disk.file_path); diff --git a/src/vmm/src/devices/virtio/block/virtio/test_utils.rs b/src/vmm/src/devices/virtio/block/virtio/test_utils.rs index b05e899f32d..14e2f1d33d0 100644 --- a/src/vmm/src/devices/virtio/block/virtio/test_utils.rs +++ b/src/vmm/src/devices/virtio/block/virtio/test_utils.rs @@ -16,6 +16,8 @@ use crate::devices::virtio::block::virtio::device::FileEngineType; #[cfg(test)] use crate::devices::virtio::block::virtio::io::FileEngine; use crate::devices::virtio::block::virtio::{CacheType, VirtioBlock}; +#[cfg(test)] +use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::{Queue, VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; use crate::devices::virtio::test_utils::{VirtQueue, VirtqDesc}; #[cfg(test)] @@ -82,7 +84,10 @@ pub fn simulate_queue_event(b: &mut VirtioBlock, maybe_expected_irq: Option, +} + /// Enum that indicates if a VirtioDevice is inactive or has been activated /// and memory attached to it. #[derive(Debug)] pub enum DeviceState { Inactive, - Activated(GuestMemoryMmap), + Activated(ActiveState), } impl DeviceState { @@ -35,10 +42,10 @@ impl DeviceState { } } - /// Gets the memory attached to the device if it is activated. - pub fn mem(&self) -> Option<&GuestMemoryMmap> { + /// Gets the memory and interrupt attached to the device if it is activated. + pub fn active_state(&self) -> Option<&ActiveState> { match self { - DeviceState::Activated(mem) => Some(mem), + DeviceState::Activated(state) => Some(state), DeviceState::Inactive => None, } } @@ -130,7 +137,11 @@ pub trait VirtioDevice: AsAny + Send { fn write_config(&mut self, offset: u64, data: &[u8]); /// Performs the formal activation for a device, which can be verified also with `is_activated`. - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError>; + fn activate( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc, + ) -> Result<(), ActivateError>; /// Checks if the resources of this device are activated. 
     fn is_activated(&self) -> bool;
 
@@ -206,7 +217,11 @@ pub(crate) mod tests {
             todo!()
         }
 
-        fn activate(&mut self, _mem: GuestMemoryMmap) -> Result<(), ActivateError> {
+        fn activate(
+            &mut self,
+            _mem: GuestMemoryMmap,
+            _interrupt: Arc,
+        ) -> Result<(), ActivateError> {
             todo!()
         }
 
diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs
index 55a94636495..9949b404809 100755
--- a/src/vmm/src/devices/virtio/net/device.rs
+++ b/src/vmm/src/devices/virtio/net/device.rs
@@ -15,7 +15,7 @@ use log::error;
 use vmm_sys_util::eventfd::EventFd;
 
 use super::NET_QUEUE_MAX_SIZE;
-use crate::devices::virtio::device::{DeviceState, VirtioDevice};
+use crate::devices::virtio::device::{ActiveState, DeviceState, VirtioDevice};
 use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1;
 use crate::devices::virtio::generated::virtio_net::{
     VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6,
@@ -250,8 +250,6 @@ pub struct Net {
 
     tx_frame_headers: [u8; frame_hdr_len()],
 
-    pub(crate) irq_trigger: IrqTrigger,
-
     pub(crate) config_space: ConfigSpace,
     pub(crate) guest_mac: Option,
 
@@ -314,7 +312,6 @@ impl Net {
             tx_rate_limiter,
             rx_frame_buf: [0u8; MAX_BUFFER_SIZE],
             tx_frame_headers: [0u8; frame_hdr_len()],
-            irq_trigger: IrqTrigger::new(),
             config_space,
             guest_mac,
             device_state: DeviceState::Inactive,
@@ -400,7 +397,7 @@ impl Net {
         queue.advance_used_ring_idx();
 
         if queue.prepare_kick() {
-            self.irq_trigger
+            self.interrupt_trigger()
                 .trigger_irq(IrqType::Vring)
                 .map_err(|err| {
                     self.metrics.event_fails.inc();
@@ -465,7 +462,7 @@ impl Net {
     /// Parse available RX `DescriptorChains` from the queue
    pub fn parse_rx_descriptors(&mut self) -> Result<(), InvalidAvailIdx> {
         // This is safe since we checked in the event handler that the device is activated.
-        let mem = self.device_state.mem().unwrap();
+        let mem = &self.device_state.active_state().unwrap().mem;
         let queue = &mut self.queues[RX_INDEX];
         while let Some(head) = queue.pop_or_enable_notification()? {
             let index = head.index;
@@ -687,7 +684,7 @@ impl Net {
 
     fn process_tx(&mut self) -> Result<(), DeviceError> {
         // This is safe since we checked in the event handler that the device is activated.
-        let mem = self.device_state.mem().unwrap();
+        let mem = &self.device_state.active_state().unwrap().mem;
 
         // The MMDS network stack works like a state machine, based on synchronous calls, and
         // without being added to any event loop. If any frame is accepted by the MMDS, we also
@@ -970,8 +967,13 @@ impl VirtioDevice for Net {
     }
 
     fn interrupt_trigger(&self) -> &IrqTrigger {
-        &self.irq_trigger
+        &self
+            .device_state
+            .active_state()
+            .expect("Device is not initialized")
+            .interrupt
     }
+
     fn read_config(&self, offset: u64, data: &mut [u8]) {
         if let Some(config_space_bytes) = self.config_space.as_slice().get(u64_to_usize(offset)..)
{ let len = config_space_bytes.len().min(data.len()); @@ -1000,7 +1002,11 @@ impl VirtioDevice for Net { self.metrics.mac_address_updates.inc(); } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc, + ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) .map_err(ActivateError::QueueMemoryError)?; @@ -1024,7 +1030,7 @@ impl VirtioDevice for Net { self.metrics.activate_fails.inc(); return Err(ActivateError::EventFd); } - self.device_state = DeviceState::Activated(mem); + self.device_state = DeviceState::Activated(ActiveState { mem, interrupt }); Ok(()) } @@ -1403,7 +1409,7 @@ pub mod tests { // Check that the used queue has advanced. assert_eq!(th.rxq.used.idx.get(), 4); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); // Check that the invalid descriptor chains have been discarded th.rxq.check_used_elem(0, 0, 0); th.rxq.check_used_elem(1, 3, 0); @@ -1460,7 +1466,7 @@ pub mod tests { assert!(th.net().rx_buffer.used_descriptors == 0); // Check that the used queue has advanced. assert_eq!(th.rxq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); // Check that the frame has been written successfully to the Rx descriptor chain. header_set_num_buffers(frame.as_mut_slice(), 1); th.rxq @@ -1523,7 +1529,7 @@ pub mod tests { assert!(th.net().rx_buffer.used_bytes == 0); // Check that the used queue has advanced. assert_eq!(th.rxq.used.idx.get(), 2); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); // Check that the 1st frame was written successfully to the 1st Rx descriptor chain. header_set_num_buffers(frame_1.as_mut_slice(), 1); th.rxq @@ -1581,7 +1587,7 @@ pub mod tests { assert!(th.net().rx_buffer.used_bytes == 0); // Check that the used queue has advanced. assert_eq!(th.rxq.used.idx.get(), 2); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); // 2 chains should be used for the packet. header_set_num_buffers(frame.as_mut_slice(), 2); @@ -1646,7 +1652,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(0, 0, 0); // Check that the frame was skipped. assert!(!tap_traffic_simulator.pop_rx_packet(&mut [])); @@ -1669,7 +1675,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(0, 0, 0); // Check that the frame was skipped. assert!(!tap_traffic_simulator.pop_rx_packet(&mut [])); @@ -1696,7 +1702,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(0, 0, 0); // Check that the frame was skipped. 
assert!(!tap_traffic_simulator.pop_rx_packet(&mut [])); @@ -1719,7 +1725,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(0, 0, 0); // Check that the frame was skipped. assert!(!tap_traffic_simulator.pop_rx_packet(&mut [])); @@ -1758,7 +1764,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 4); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(3, 4, 0); // Check that the valid frame was sent to the tap. let mut buf = vec![0; 1000]; @@ -1789,7 +1795,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(0, 3, 0); // Check that the frame was sent to the tap. let mut buf = vec![0; 1000]; @@ -1818,7 +1824,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(0, 0, 0); // dropping th would double close the tap fd, so leak it @@ -1849,7 +1855,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 2); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(0, 0, 0); th.txq.check_used_elem(1, 3, 0); // Check that the first frame was sent to the tap. 
@@ -2201,7 +2207,7 @@ pub mod tests { assert_eq!(th.net().metrics.rx_rate_limiter_throttled.count(), 1); assert!(th.net().rx_buffer.used_descriptors != 0); // assert that no operation actually completed (limiter blocked it) - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); // make sure the data is still queued for processing assert_eq!(th.rxq.used.idx.get(), 0); } @@ -2229,7 +2235,7 @@ pub mod tests { // validate the rate_limiter is no longer blocked assert!(!th.net().rx_rate_limiter.is_blocked()); // make sure the virtio queue operation completed this time - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); // make sure the data queue advanced assert_eq!(th.rxq.used.idx.get(), 1); th.rxq @@ -2326,14 +2332,14 @@ pub mod tests { assert!(th.net().metrics.rx_rate_limiter_throttled.count() >= 1); assert!(th.net().rx_buffer.used_descriptors != 0); // assert that no operation actually completed (limiter blocked it) - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); // make sure the data is still queued for processing assert_eq!(th.rxq.used.idx.get(), 0); // trigger the RX handler again, this time it should do the limiter fast path exit th.simulate_event(NetEvent::Tap); // assert that no operation actually completed, that the limiter blocked it - assert!(!&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(!th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); // make sure the data is still queued for processing assert_eq!(th.rxq.used.idx.get(), 0); } @@ -2346,7 +2352,7 @@ pub mod tests { { th.simulate_event(NetEvent::RxRateLimiter); // make sure the virtio queue operation completed this time - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); // make sure the data queue advanced assert_eq!(th.rxq.used.idx.get(), 1); th.rxq @@ -2416,7 +2422,7 @@ pub mod tests { assert_eq!(net.queue_events().len(), NET_QUEUE_SIZES.len()); // Test interrupts. - assert!(!&net.irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(!net.interrupt_trigger().has_pending_irq(IrqType::Vring)); } #[test] diff --git a/src/vmm/src/devices/virtio/net/persist.rs b/src/vmm/src/devices/virtio/net/persist.rs index 50e761273db..9072d3dd5e7 100644 --- a/src/vmm/src/devices/virtio/net/persist.rs +++ b/src/vmm/src/devices/virtio/net/persist.rs @@ -4,7 +4,6 @@ //! Defines the structures needed for saving/restoring net devices. use std::io; -use std::sync::atomic::AtomicU32; use std::sync::{Arc, Mutex}; use serde::{Deserialize, Serialize}; @@ -12,8 +11,9 @@ use serde::{Deserialize, Serialize}; use super::device::{Net, RxBuffers}; use super::{NET_NUM_QUEUES, NET_QUEUE_MAX_SIZE, RX_INDEX, TapError}; use crate::devices::virtio::TYPE_NET; -use crate::devices::virtio::device::DeviceState; +use crate::devices::virtio::device::{ActiveState, DeviceState}; use crate::devices::virtio::persist::{PersistError as VirtioStateError, VirtioDeviceState}; +use crate::devices::virtio::transport::mmio::IrqTrigger; use crate::mmds::data_store::Mmds; use crate::mmds::ns::MmdsNetworkStack; use crate::mmds::persist::MmdsNetworkStackState; @@ -71,6 +71,8 @@ pub struct NetState { pub struct NetConstructorArgs { /// Pointer to guest memory. 
pub mem: GuestMemoryMmap, + /// Interrupt for the device. + pub interrupt: Arc, /// Pointer to the MMDS data store. pub mmds: Option>>, } @@ -148,7 +150,6 @@ impl Persist<'_> for Net { NET_NUM_QUEUES, NET_QUEUE_MAX_SIZE, )?; - net.irq_trigger.irq_status = Arc::new(AtomicU32::new(state.virtio_state.interrupt_status)); net.avail_features = state.virtio_state.avail_features; net.acked_features = state.virtio_state.acked_features; @@ -158,7 +159,10 @@ impl Persist<'_> for Net { .set_offload(supported_flags) .map_err(NetPersistError::TapSetOffload)?; - net.device_state = DeviceState::Activated(constructor_args.mem); + net.device_state = DeviceState::Activated(ActiveState { + mem: constructor_args.mem, + interrupt: constructor_args.interrupt, + }); // Recreate `Net::rx_buffer`. We do it by re-parsing the RX queue. We're temporarily // rolling back `next_avail` in the RX queue and call `parse_rx_descriptors`. @@ -175,12 +179,11 @@ impl Persist<'_> for Net { #[cfg(test)] mod tests { - use std::sync::atomic::Ordering; use super::*; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::net::test_utils::{default_net, default_net_no_mmds}; - use crate::devices::virtio::test_utils::default_mem; + use crate::devices::virtio::test_utils::{default_interrupt, default_mem}; use crate::snapshot::Snapshot; fn validate_save_and_restore(net: Net, mmds_ds: Option>>) { @@ -213,6 +216,7 @@ mod tests { match Net::restore( NetConstructorArgs { mem: guest_mem, + interrupt: default_interrupt(), mmds: mmds_ds, }, &Snapshot::deserialize(&mut mem.as_slice()).unwrap(), @@ -222,10 +226,6 @@ mod tests { assert_eq!(restored_net.device_type(), TYPE_NET); assert_eq!(restored_net.avail_features(), virtio_state.avail_features); assert_eq!(restored_net.acked_features(), virtio_state.acked_features); - assert_eq!( - restored_net.interrupt_status().load(Ordering::Relaxed), - virtio_state.interrupt_status - ); assert_eq!(restored_net.is_activated(), virtio_state.activated); // Test that net specific fields are the same. diff --git a/src/vmm/src/devices/virtio/net/test_utils.rs b/src/vmm/src/devices/virtio/net/test_utils.rs index ec52883e979..c81ad58205c 100644 --- a/src/vmm/src/devices/virtio/net/test_utils.rs +++ b/src/vmm/src/devices/virtio/net/test_utils.rs @@ -299,7 +299,7 @@ pub mod test { }; use crate::devices::virtio::net::{MAX_BUFFER_SIZE, Net, RX_INDEX, TX_INDEX}; use crate::devices::virtio::queue::{VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; - use crate::devices::virtio::test_utils::{VirtQueue, VirtqDesc}; + use crate::devices::virtio::test_utils::{VirtQueue, VirtqDesc, default_interrupt}; use crate::devices::virtio::transport::mmio::IrqType; use crate::logger::IncMetric; use crate::vstate::memory::{Address, Bytes, GuestAddress, GuestMemoryMmap}; @@ -359,7 +359,12 @@ pub mod test { } pub fn activate_net(&mut self) { - self.net.lock().unwrap().activate(self.mem.clone()).unwrap(); + let interrupt = default_interrupt(); + self.net + .lock() + .unwrap() + .activate(self.mem.clone(), interrupt) + .unwrap(); // Process the activate event. let ev_count = self.event_manager.run_with_timeout(100).unwrap(); assert_eq!(ev_count, 1); @@ -436,7 +441,11 @@ pub mod test { old_used_descriptors + 1 ); - assert!(&self.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!( + self.net() + .interrupt_trigger() + .has_pending_irq(IrqType::Vring) + ); frame } @@ -462,7 +471,11 @@ pub mod test { ); // Check that the expected frame was sent to the Rx queue eventually. 
assert_eq!(self.rxq.used.idx.get(), used_idx + 1); - assert!(&self.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!( + self.net() + .interrupt_trigger() + .has_pending_irq(IrqType::Vring) + ); self.rxq .check_used_elem(used_idx, 0, expected_frame.len().try_into().unwrap()); self.rxq.dtable[0].check_data(expected_frame); diff --git a/src/vmm/src/devices/virtio/persist.rs b/src/vmm/src/devices/virtio/persist.rs index 1a1eb6dba7d..776c7179048 100644 --- a/src/vmm/src/devices/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/persist.rs @@ -10,6 +10,7 @@ use std::sync::{Arc, Mutex}; use serde::{Deserialize, Serialize}; use super::queue::{InvalidAvailIdx, QueueError}; +use super::transport::mmio::IrqTrigger; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::generated::virtio_ring::VIRTIO_RING_F_EVENT_IDX; use crate::devices::virtio::queue::Queue; @@ -123,8 +124,6 @@ pub struct VirtioDeviceState { pub acked_features: u64, /// List of queues. pub queues: Vec, - /// The MMIO interrupt status. - pub interrupt_status: u32, /// Flag for activated status. pub activated: bool, } @@ -137,7 +136,6 @@ impl VirtioDeviceState { avail_features: device.avail_features(), acked_features: device.acked_features(), queues: device.queues().iter().map(Persist::save).collect(), - interrupt_status: device.interrupt_status().load(Ordering::Relaxed), activated: device.is_activated(), } } @@ -202,6 +200,7 @@ pub struct MmioTransportState { queue_select: u32, device_status: u32, config_generation: u32, + interrupt_status: u32, } /// Auxiliary structure for initializing the transport when resuming from a snapshot. @@ -209,6 +208,8 @@ pub struct MmioTransportState { pub struct MmioTransportConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, + /// Interrupt to use for the device + pub interrupt: Arc, /// Device associated with the current MMIO state. pub device: Arc>, /// Is device backed by vhost-user. @@ -227,6 +228,7 @@ impl Persist<'_> for MmioTransport { queue_select: self.queue_select, device_status: self.device_status, config_generation: self.config_generation, + interrupt_status: self.interrupt.irq_status.load(Ordering::SeqCst), } } @@ -236,6 +238,7 @@ impl Persist<'_> for MmioTransport { ) -> Result { let mut transport = MmioTransport::new( constructor_args.mem, + constructor_args.interrupt, constructor_args.device, constructor_args.is_vhost_user, ); @@ -244,6 +247,10 @@ impl Persist<'_> for MmioTransport { transport.queue_select = state.queue_select; transport.device_status = state.device_status; transport.config_generation = state.config_generation; + transport + .interrupt + .irq_status + .store(state.interrupt_status, Ordering::SeqCst); Ok(transport) } } @@ -383,7 +390,7 @@ mod tests { self.queue_select == other.queue_select && self.device_status == other.device_status && self.config_generation == other.config_generation && - self.interrupt_status.load(Ordering::SeqCst) == other.interrupt_status.load(Ordering::SeqCst) && + self.interrupt.irq_status.load(Ordering::SeqCst) == other.interrupt.irq_status.load(Ordering::SeqCst) && // Only checking equality of device type, actual device (de)ser is tested by that // device's tests. 
self_dev_type == other.device().lock().unwrap().device_type() @@ -392,6 +399,7 @@ mod tests { fn generic_mmiotransport_persistence_test( mmio_transport: MmioTransport, + interrupt: Arc, mem: GuestMemoryMmap, device: Arc>, ) { @@ -401,6 +409,7 @@ mod tests { let restore_args = MmioTransportConstructorArgs { mem, + interrupt, device, is_vhost_user: false, }; @@ -413,8 +422,14 @@ mod tests { assert_eq!(restored_mmio_transport, mmio_transport); } - fn create_default_block() -> (MmioTransport, GuestMemoryMmap, Arc>) { + fn create_default_block() -> ( + MmioTransport, + Arc, + GuestMemoryMmap, + Arc>, + ) { let mem = default_mem(); + let interrupt = Arc::new(IrqTrigger::new()); // Create backing file. let f = TempFile::new().unwrap(); @@ -424,25 +439,34 @@ mod tests { FileEngineType::default(), ); let block = Arc::new(Mutex::new(block)); - let mmio_transport = MmioTransport::new(mem.clone(), block.clone(), false); + let mmio_transport = + MmioTransport::new(mem.clone(), interrupt.clone(), block.clone(), false); - (mmio_transport, mem, block) + (mmio_transport, interrupt, mem, block) } - fn create_default_net() -> (MmioTransport, GuestMemoryMmap, Arc>) { + fn create_default_net() -> ( + MmioTransport, + Arc, + GuestMemoryMmap, + Arc>, + ) { let mem = default_mem(); + let interrupt = Arc::new(IrqTrigger::new()); let net = Arc::new(Mutex::new(default_net())); - let mmio_transport = MmioTransport::new(mem.clone(), net.clone(), false); + let mmio_transport = MmioTransport::new(mem.clone(), interrupt.clone(), net.clone(), false); - (mmio_transport, mem, net) + (mmio_transport, interrupt, mem, net) } fn default_vsock() -> ( MmioTransport, + Arc, GuestMemoryMmap, Arc>>, ) { let mem = default_mem(); + let interrupt = Arc::new(IrqTrigger::new()); let guest_cid = 52; let mut temp_uds_path = TempFile::new().unwrap(); @@ -452,26 +476,27 @@ mod tests { let backend = VsockUnixBackend::new(guest_cid, uds_path).unwrap(); let vsock = Vsock::new(guest_cid, backend).unwrap(); let vsock = Arc::new(Mutex::new(vsock)); - let mmio_transport = MmioTransport::new(mem.clone(), vsock.clone(), false); + let mmio_transport = + MmioTransport::new(mem.clone(), interrupt.clone(), vsock.clone(), false); - (mmio_transport, mem, vsock) + (mmio_transport, interrupt, mem, vsock) } #[test] fn test_block_over_mmiotransport_persistence() { - let (mmio_transport, mem, block) = create_default_block(); - generic_mmiotransport_persistence_test(mmio_transport, mem, block); + let (mmio_transport, interrupt, mem, block) = create_default_block(); + generic_mmiotransport_persistence_test(mmio_transport, interrupt, mem, block); } #[test] fn test_net_over_mmiotransport_persistence() { - let (mmio_transport, mem, net) = create_default_net(); - generic_mmiotransport_persistence_test(mmio_transport, mem, net); + let (mmio_transport, interrupt, mem, net) = create_default_net(); + generic_mmiotransport_persistence_test(mmio_transport, interrupt, mem, net); } #[test] fn test_vsock_over_mmiotransport_persistence() { - let (mmio_transport, mem, vsock) = default_vsock(); - generic_mmiotransport_persistence_test(mmio_transport, mem, vsock); + let (mmio_transport, interrupt, mem, vsock) = default_vsock(); + generic_mmiotransport_persistence_test(mmio_transport, interrupt, mem, vsock); } } diff --git a/src/vmm/src/devices/virtio/queue.rs b/src/vmm/src/devices/virtio/queue.rs index ec845fe6394..9977070293e 100644 --- a/src/vmm/src/devices/virtio/queue.rs +++ b/src/vmm/src/devices/virtio/queue.rs @@ -20,7 +20,7 @@ pub(super) const FIRECRACKER_MAX_QUEUE_SIZE: 
u16 = 256; // GuestMemoryMmap::read_obj_from_addr() will be used to fetch the descriptor, // which has an explicit constraint that the entire descriptor doesn't -// cross the page boundary. Otherwise the descriptor may be splitted into +// cross the page boundary. Otherwise the descriptor may be split into // two mmap regions which causes failure of GuestMemoryMmap::read_obj_from_addr(). // // The Virtio Spec 1.0 defines the alignment of VirtIO descriptor is 16 bytes, diff --git a/src/vmm/src/devices/virtio/rng/device.rs b/src/vmm/src/devices/virtio/rng/device.rs index d644161d87e..1433a7086e2 100644 --- a/src/vmm/src/devices/virtio/rng/device.rs +++ b/src/vmm/src/devices/virtio/rng/device.rs @@ -3,7 +3,6 @@ use std::io; use std::sync::Arc; -use std::sync::atomic::AtomicU32; use aws_lc_rs::rand; use vm_memory::GuestMemoryError; @@ -12,7 +11,7 @@ use vmm_sys_util::eventfd::EventFd; use super::metrics::METRICS; use super::{RNG_NUM_QUEUES, RNG_QUEUE}; use crate::devices::DeviceError; -use crate::devices::virtio::device::{DeviceState, VirtioDevice}; +use crate::devices::virtio::device::{ActiveState, DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::iov_deque::IovDequeError; use crate::devices::virtio::iovec::IoVecBufferMut; @@ -48,7 +47,6 @@ pub struct Entropy { device_state: DeviceState, pub(crate) queues: Vec, queue_events: Vec, - irq_trigger: IrqTrigger, // Device specific fields rate_limiter: RateLimiter, @@ -70,7 +68,6 @@ impl Entropy { let queue_events = (0..RNG_NUM_QUEUES) .map(|_| EventFd::new(libc::EFD_NONBLOCK)) .collect::, io::Error>>()?; - let irq_trigger = IrqTrigger::new(); Ok(Self { avail_features: 1 << VIRTIO_F_VERSION_1, @@ -79,7 +76,6 @@ impl Entropy { device_state: DeviceState::Inactive, queues, queue_events, - irq_trigger, rate_limiter, buffer: IoVecBufferMut::new()?, }) @@ -90,7 +86,7 @@ impl Entropy { } fn signal_used_queue(&self) -> Result<(), DeviceError> { - self.irq_trigger + self.interrupt_trigger() .trigger_irq(IrqType::Vring) .map_err(DeviceError::FailedSignalingIrq) } @@ -133,7 +129,7 @@ impl Entropy { let mut used_any = false; while let Some(desc) = self.queues[RNG_QUEUE].pop()? { // This is safe since we checked in the event handler that the device is activated. 
- let mem = self.device_state.mem().unwrap(); + let mem = &self.device_state.active_state().unwrap().mem; let index = desc.index; METRICS.entropy_event_count.inc(); @@ -240,12 +236,8 @@ impl Entropy { self.acked_features = features; } - pub(crate) fn set_irq_status(&mut self, status: u32) { - self.irq_trigger.irq_status = Arc::new(AtomicU32::new(status)); - } - - pub(crate) fn set_activated(&mut self, mem: GuestMemoryMmap) { - self.device_state = DeviceState::Activated(mem); + pub(crate) fn set_activated(&mut self, mem: GuestMemoryMmap, interrupt: Arc) { + self.device_state = DeviceState::Activated(ActiveState { mem, interrupt }); } pub(crate) fn activate_event(&self) -> &EventFd { @@ -271,7 +263,11 @@ impl VirtioDevice for Entropy { } fn interrupt_trigger(&self) -> &IrqTrigger { - &self.irq_trigger + &self + .device_state + .active_state() + .expect("Device is not initialized") + .interrupt } fn avail_features(&self) -> u64 { @@ -294,7 +290,11 @@ impl VirtioDevice for Entropy { self.device_state.is_activated() } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc, + ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) .map_err(ActivateError::QueueMemoryError)?; @@ -304,7 +304,7 @@ impl VirtioDevice for Entropy { METRICS.activate_fails.inc(); ActivateError::EventFd })?; - self.device_state = DeviceState::Activated(mem); + self.device_state = DeviceState::Activated(ActiveState { mem, interrupt }); Ok(()) } } diff --git a/src/vmm/src/devices/virtio/rng/persist.rs b/src/vmm/src/devices/virtio/rng/persist.rs index 2f2519b4962..dd2d62debee 100644 --- a/src/vmm/src/devices/virtio/rng/persist.rs +++ b/src/vmm/src/devices/virtio/rng/persist.rs @@ -3,12 +3,15 @@ //! Defines the structures needed for saving/restoring entropy devices. 
+use std::sync::Arc; + use serde::{Deserialize, Serialize}; use crate::devices::virtio::TYPE_RNG; use crate::devices::virtio::persist::{PersistError as VirtioStateError, VirtioDeviceState}; use crate::devices::virtio::queue::FIRECRACKER_MAX_QUEUE_SIZE; use crate::devices::virtio::rng::{Entropy, EntropyError, RNG_NUM_QUEUES}; +use crate::devices::virtio::transport::mmio::IrqTrigger; use crate::rate_limiter::RateLimiter; use crate::rate_limiter::persist::RateLimiterState; use crate::snapshot::Persist; @@ -21,11 +24,14 @@ pub struct EntropyState { } #[derive(Debug)] -pub struct EntropyConstructorArgs(GuestMemoryMmap); +pub struct EntropyConstructorArgs { + mem: GuestMemoryMmap, + interrupt: Arc, +} impl EntropyConstructorArgs { - pub fn new(mem: GuestMemoryMmap) -> Self { - Self(mem) + pub fn new(mem: GuestMemoryMmap, interrupt: Arc) -> Self { + Self { mem, interrupt } } } @@ -56,7 +62,7 @@ impl Persist<'_> for Entropy { state: &Self::State, ) -> Result { let queues = state.virtio_state.build_queues_checked( - &constructor_args.0, + &constructor_args.mem, TYPE_RNG, RNG_NUM_QUEUES, FIRECRACKER_MAX_QUEUE_SIZE, @@ -66,9 +72,8 @@ impl Persist<'_> for Entropy { let mut entropy = Entropy::new_with_queues(queues, rate_limiter)?; entropy.set_avail_features(state.virtio_state.avail_features); entropy.set_acked_features(state.virtio_state.acked_features); - entropy.set_irq_status(state.virtio_state.interrupt_status); if state.virtio_state.activated { - entropy.set_activated(constructor_args.0); + entropy.set_activated(constructor_args.mem, constructor_args.interrupt); } Ok(entropy) @@ -77,11 +82,11 @@ impl Persist<'_> for Entropy { #[cfg(test)] mod tests { - use std::sync::atomic::Ordering; use super::*; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::rng::device::ENTROPY_DEV_ID; + use crate::devices::virtio::test_utils::default_interrupt; use crate::devices::virtio::test_utils::test::create_virtio_mem; use crate::snapshot::Snapshot; @@ -94,19 +99,16 @@ mod tests { let guest_mem = create_virtio_mem(); let restored = Entropy::restore( - EntropyConstructorArgs(guest_mem), + EntropyConstructorArgs::new(guest_mem, default_interrupt()), &Snapshot::deserialize(&mut mem.as_slice()).unwrap(), ) .unwrap(); assert_eq!(restored.device_type(), TYPE_RNG); assert_eq!(restored.id(), ENTROPY_DEV_ID); - assert_eq!(restored.is_activated(), entropy.is_activated()); + assert!(!restored.is_activated()); + assert!(!entropy.is_activated()); assert_eq!(restored.avail_features(), entropy.avail_features()); assert_eq!(restored.acked_features(), entropy.acked_features()); - assert_eq!( - restored.interrupt_status().load(Ordering::Relaxed), - entropy.interrupt_status().load(Ordering::Relaxed) - ); } } diff --git a/src/vmm/src/devices/virtio/test_utils.rs b/src/vmm/src/devices/virtio/test_utils.rs index 8642d0a85f4..29fbdc5ec56 100644 --- a/src/vmm/src/devices/virtio/test_utils.rs +++ b/src/vmm/src/devices/virtio/test_utils.rs @@ -6,6 +6,7 @@ use std::fmt::Debug; use std::marker::PhantomData; use std::mem; +use std::sync::Arc; use std::sync::atomic::{AtomicUsize, Ordering}; use crate::devices::virtio::queue::Queue; @@ -13,6 +14,8 @@ use crate::test_utils::single_region_mem; use crate::utils::{align_up, u64_to_usize}; use crate::vstate::memory::{Address, Bytes, GuestAddress, GuestMemoryMmap}; +use super::transport::mmio::IrqTrigger; + #[macro_export] macro_rules! 
check_metric_after_block { ($metric:expr, $delta:expr, $block:expr) => {{ @@ -28,6 +31,11 @@ pub fn default_mem() -> GuestMemoryMmap { single_region_mem(0x10000) } +/// Creates a default ['IrqTrigger'] interrupt for a VirtIO device. +pub fn default_interrupt() -> Arc { + Arc::new(IrqTrigger::new()) +} + #[derive(Debug)] pub struct InputData { pub data: Vec, @@ -323,7 +331,7 @@ pub(crate) mod test { use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::net::MAX_BUFFER_SIZE; use crate::devices::virtio::queue::{Queue, VIRTQ_DESC_F_NEXT}; - use crate::devices::virtio::test_utils::{VirtQueue, VirtqDesc}; + use crate::devices::virtio::test_utils::{VirtQueue, VirtqDesc, default_interrupt}; use crate::test_utils::single_region_mem; use crate::vstate::memory::{Address, GuestAddress, GuestMemoryMmap}; @@ -414,7 +422,12 @@ pub(crate) mod test { /// Activate the device pub fn activate_device(&mut self, mem: &'a GuestMemoryMmap) { - self.device.lock().unwrap().activate(mem.clone()).unwrap(); + let interrupt = default_interrupt(); + self.device + .lock() + .unwrap() + .activate(mem.clone(), interrupt) + .unwrap(); // Process the activate event let ev_count = self.event_manager.run_with_timeout(100).unwrap(); assert_eq!(ev_count, 1); diff --git a/src/vmm/src/devices/virtio/transport/mmio.rs b/src/vmm/src/devices/virtio/transport/mmio.rs index a763ff811dc..f1a8c8bfabf 100644 --- a/src/vmm/src/devices/virtio/transport/mmio.rs +++ b/src/vmm/src/devices/virtio/transport/mmio.rs @@ -57,7 +57,7 @@ pub struct MmioTransport { pub(crate) device_status: u32, pub(crate) config_generation: u32, mem: GuestMemoryMmap, - pub(crate) interrupt_status: Arc, + pub(crate) interrupt: Arc, pub is_vhost_user: bool, } @@ -65,11 +65,10 @@ impl MmioTransport { /// Constructs a new MMIO transport for the given virtio device. pub fn new( mem: GuestMemoryMmap, + interrupt: Arc, device: Arc>, is_vhost_user: bool, ) -> MmioTransport { - let interrupt_status = device.lock().expect("Poisoned lock").interrupt_status(); - MmioTransport { device, features_select: 0, @@ -78,7 +77,7 @@ impl MmioTransport { device_status: device_status::INIT, config_generation: 0, mem, - interrupt_status, + interrupt, is_vhost_user, } } @@ -146,7 +145,7 @@ impl MmioTransport { self.features_select = 0; self.acked_features_select = 0; self.queue_select = 0; - self.interrupt_status.store(0, Ordering::SeqCst); + self.interrupt.irq_status.store(0, Ordering::SeqCst); self.device_status = device_status::INIT; // . Keep interrupt_evt and queue_evts as is. There may be pending notifications in those // eventfds, but nothing will happen other than supurious wakeups. @@ -182,7 +181,9 @@ impl MmioTransport { let device_activated = self.locked_device().is_activated(); if !device_activated { // temporary variable needed for borrow checker - let activate_result = self.locked_device().activate(self.mem.clone()); + let activate_result = self + .locked_device() + .activate(self.mem.clone(), self.interrupt.clone()); if let Err(err) = activate_result { self.device_status |= DEVICE_NEEDS_RESET; @@ -265,7 +266,7 @@ impl MmioTransport { // `VIRTIO_MMIO_INT_CONFIG` or not to understand if we need to send // `VIRTIO_MMIO_INT_CONFIG` or // `VIRTIO_MMIO_INT_VRING`. 
- let is = self.interrupt_status.load(Ordering::SeqCst); + let is = self.interrupt.irq_status.load(Ordering::SeqCst); if !self.is_vhost_user { is } else if is == VIRTIO_MMIO_INT_CONFIG { @@ -326,7 +327,7 @@ impl MmioTransport { 0x44 => self.update_queue_field(|q| q.ready = v == 1), 0x64 => { if self.check_device_status(device_status::DRIVER_OK, 0) { - self.interrupt_status.fetch_and(!v, Ordering::SeqCst); + self.interrupt.irq_status.fetch_and(!v, Ordering::SeqCst); } } 0x70 => self.set_device_status(v), @@ -407,6 +408,7 @@ impl IrqTrigger { #[cfg(test)] pub(crate) mod tests { + use vmm_sys_util::eventfd::EventFd; use super::*; @@ -421,7 +423,7 @@ pub(crate) mod tests { pub(crate) struct DummyDevice { acked_features: u64, avail_features: u64, - interrupt_trigger: IrqTrigger, + interrupt_trigger: Option>, queue_evts: Vec, queues: Vec, device_activated: bool, @@ -434,7 +436,7 @@ pub(crate) mod tests { DummyDevice { acked_features: 0, avail_features: 0, - interrupt_trigger: IrqTrigger::new(), + interrupt_trigger: None, queue_evts: vec![ EventFd::new(libc::EFD_NONBLOCK).unwrap(), EventFd::new(libc::EFD_NONBLOCK).unwrap(), @@ -481,7 +483,9 @@ pub(crate) mod tests { } fn interrupt_trigger(&self) -> &IrqTrigger { - &self.interrupt_trigger + self.interrupt_trigger + .as_ref() + .expect("Device is not activated") } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -494,8 +498,13 @@ pub(crate) mod tests { } } - fn activate(&mut self, _: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + _: GuestMemoryMmap, + interrupt: Arc, + ) -> Result<(), ActivateError> { self.device_activated = true; + self.interrupt_trigger = Some(interrupt); if self.activate_should_error { Err(ActivateError::EventFd) } else { @@ -517,10 +526,11 @@ pub(crate) mod tests { #[test] fn test_new() { let m = single_region_mem(0x1000); + let interrupt = Arc::new(IrqTrigger::new()); let mut dummy = DummyDevice::new(); // Validate reset is no-op. assert!(dummy.reset().is_none()); - let mut d = MmioTransport::new(m, Arc::new(Mutex::new(dummy)), false); + let mut d = MmioTransport::new(m, interrupt, Arc::new(Mutex::new(dummy)), false); // We just make sure here that the implementation of a mmio device behaves as we expect, // given a known virtio device implementation (the dummy device). 
@@ -545,7 +555,13 @@ pub(crate) mod tests { #[test] fn test_bus_device_read() { let m = single_region_mem(0x1000); - let mut d = MmioTransport::new(m, Arc::new(Mutex::new(DummyDevice::new())), false); + let interrupt = Arc::new(IrqTrigger::new()); + let mut d = MmioTransport::new( + m, + interrupt, + Arc::new(Mutex::new(DummyDevice::new())), + false, + ); let mut buf = vec![0xff, 0, 0xfe, 0]; let buf_copy = buf.to_vec(); @@ -592,17 +608,18 @@ pub(crate) mod tests { d.bus_read(0x44, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), u32::from(false)); - d.interrupt_status.store(111, Ordering::SeqCst); + d.interrupt.irq_status.store(111, Ordering::SeqCst); d.bus_read(0x60, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), 111); d.is_vhost_user = true; - d.interrupt_status.store(0, Ordering::SeqCst); + d.interrupt.irq_status.store(0, Ordering::SeqCst); d.bus_read(0x60, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), VIRTIO_MMIO_INT_VRING); d.is_vhost_user = true; - d.interrupt_status + d.interrupt + .irq_status .store(VIRTIO_MMIO_INT_CONFIG, Ordering::SeqCst); d.bus_read(0x60, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), VIRTIO_MMIO_INT_CONFIG); @@ -634,8 +651,9 @@ pub(crate) mod tests { #[allow(clippy::cognitive_complexity)] fn test_bus_device_write() { let m = single_region_mem(0x1000); + let interrupt = Arc::new(IrqTrigger::new()); let dummy_dev = Arc::new(Mutex::new(DummyDevice::new())); - let mut d = MmioTransport::new(m, dummy_dev.clone(), false); + let mut d = MmioTransport::new(m, interrupt, dummy_dev.clone(), false); let mut buf = vec![0; 5]; write_le_u32(&mut buf[..4], 1); @@ -762,10 +780,10 @@ pub(crate) mod tests { | device_status::DRIVER_OK, ); - d.interrupt_status.store(0b10_1010, Ordering::Relaxed); + d.interrupt.irq_status.store(0b10_1010, Ordering::Relaxed); write_le_u32(&mut buf[..], 0b111); d.bus_write(0x64, &buf[..]); - assert_eq!(d.interrupt_status.load(Ordering::Relaxed), 0b10_1000); + assert_eq!(d.interrupt.irq_status.load(Ordering::Relaxed), 0b10_1000); // Write to an invalid address in generic register range. 
write_le_u32(&mut buf[..], 0xf); @@ -796,7 +814,13 @@ pub(crate) mod tests { #[test] fn test_bus_device_activate() { let m = single_region_mem(0x1000); - let mut d = MmioTransport::new(m, Arc::new(Mutex::new(DummyDevice::new())), false); + let interrupt = Arc::new(IrqTrigger::new()); + let mut d = MmioTransport::new( + m, + interrupt, + Arc::new(Mutex::new(DummyDevice::new())), + false, + ); assert!(!d.locked_device().is_activated()); assert_eq!(d.device_status, device_status::INIT); @@ -873,11 +897,12 @@ pub(crate) mod tests { #[test] fn test_bus_device_activate_failure() { let m = single_region_mem(0x1000); + let interrupt = Arc::new(IrqTrigger::new()); let device = DummyDevice { activate_should_error: true, ..DummyDevice::new() }; - let mut d = MmioTransport::new(m, Arc::new(Mutex::new(device)), false); + let mut d = MmioTransport::new(m, interrupt, Arc::new(Mutex::new(device)), false); set_device_status(&mut d, device_status::ACKNOWLEDGE); set_device_status(&mut d, device_status::ACKNOWLEDGE | device_status::DRIVER); @@ -895,10 +920,7 @@ pub(crate) mod tests { write_le_u32(&mut buf[..], 1); d.bus_write(0x44, &buf[..]); } - assert_eq!( - d.locked_device().interrupt_status().load(Ordering::SeqCst), - 0 - ); + assert!(!d.locked_device().is_activated()); set_device_status( &mut d, @@ -967,7 +989,13 @@ pub(crate) mod tests { #[test] fn test_bus_device_reset() { let m = single_region_mem(0x1000); - let mut d = MmioTransport::new(m, Arc::new(Mutex::new(DummyDevice::new())), false); + let interrupt = Arc::new(IrqTrigger::new()); + let mut d = MmioTransport::new( + m, + interrupt, + Arc::new(Mutex::new(DummyDevice::new())), + false, + ); let mut buf = [0; 4]; assert!(!d.locked_device().is_activated()); diff --git a/src/vmm/src/devices/virtio/vsock/device.rs b/src/vmm/src/devices/virtio/vsock/device.rs index fc51a61532c..ad049b517e4 100644 --- a/src/vmm/src/devices/virtio/vsock/device.rs +++ b/src/vmm/src/devices/virtio/vsock/device.rs @@ -21,6 +21,7 @@ //! - a backend FD. use std::fmt::Debug; +use std::sync::Arc; use log::{error, warn}; use vmm_sys_util::eventfd::EventFd; @@ -30,7 +31,7 @@ use super::defs::uapi; use super::packet::{VSOCK_PKT_HDR_SIZE, VsockPacketRx, VsockPacketTx}; use super::{VsockBackend, defs}; use crate::devices::virtio::ActivateError; -use crate::devices::virtio::device::{DeviceState, VirtioDevice}; +use crate::devices::virtio::device::{ActiveState, DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_config::{VIRTIO_F_IN_ORDER, VIRTIO_F_VERSION_1}; use crate::devices::virtio::queue::{InvalidAvailIdx, Queue as VirtQueue}; use crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; @@ -62,7 +63,6 @@ pub struct Vsock { pub(crate) backend: B, pub(crate) avail_features: u64, pub(crate) acked_features: u64, - pub(crate) irq_trigger: IrqTrigger, // This EventFd is the only one initially registered for a vsock device, and is used to convert // a VirtioDevice::activate call into an EventHandler read event which allows the other events // (queue and backend related) to be registered post virtio device activation. That's @@ -103,7 +103,6 @@ where backend, avail_features: AVAIL_FEATURES, acked_features: 0, - irq_trigger: IrqTrigger::new(), activate_evt: EventFd::new(libc::EFD_NONBLOCK).map_err(VsockError::EventFd)?, device_state: DeviceState::Inactive, rx_packet: VsockPacketRx::new()?, @@ -138,7 +137,10 @@ where /// Signal the guest driver that we've used some virtio buffers that it had previously made /// available. 
pub fn signal_used_queue(&self) -> Result<(), DeviceError> { - self.irq_trigger + self.device_state + .active_state() + .expect("Device is not initialized") + .interrupt .trigger_irq(IrqType::Vring) .map_err(DeviceError::FailedSignalingIrq) } @@ -148,7 +150,7 @@ where /// otherwise. pub fn process_rx(&mut self) -> Result { // This is safe since we checked in the event handler that the device is activated. - let mem = self.device_state.mem().unwrap(); + let mem = &self.device_state.active_state().unwrap().mem; let queue = &mut self.queues[RXQ_INDEX]; let mut have_used = false; @@ -201,7 +203,7 @@ where /// ring, and `false` otherwise. pub fn process_tx(&mut self) -> Result { // This is safe since we checked in the event handler that the device is activated. - let mem = self.device_state.mem().unwrap(); + let mem = &self.device_state.active_state().unwrap().mem; let queue = &mut self.queues[TXQ_INDEX]; let mut have_used = false; @@ -241,7 +243,7 @@ where // remain but their CID is updated to reflect the current guest_cid. pub fn send_transport_reset_event(&mut self) -> Result<(), DeviceError> { // This is safe since we checked in the caller function that the device is activated. - let mem = self.device_state.mem().unwrap(); + let mem = &self.device_state.active_state().unwrap().mem; let queue = &mut self.queues[EVQ_INDEX]; let head = queue.pop()?.ok_or_else(|| { @@ -296,7 +298,11 @@ where } fn interrupt_trigger(&self) -> &IrqTrigger { - &self.irq_trigger + &self + .device_state + .active_state() + .expect("Device is not initialized") + .interrupt } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -328,7 +334,11 @@ where ); } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc, + ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) .map_err(ActivateError::QueueMemoryError)?; @@ -347,7 +357,7 @@ where return Err(ActivateError::EventFd); } - self.device_state = DeviceState::Activated(mem); + self.device_state = DeviceState::Activated(ActiveState { mem, interrupt }); Ok(()) } @@ -430,6 +440,8 @@ mod tests { // } // Test a correct activation. 
- ctx.device.activate(ctx.mem.clone()).unwrap(); + ctx.device + .activate(ctx.mem.clone(), ctx.interrupt.clone()) + .unwrap(); } } diff --git a/src/vmm/src/devices/virtio/vsock/event_handler.rs b/src/vmm/src/devices/virtio/vsock/event_handler.rs index 59fbd3eaa3d..9c909048a69 100755 --- a/src/vmm/src/devices/virtio/vsock/event_handler.rs +++ b/src/vmm/src/devices/virtio/vsock/event_handler.rs @@ -240,7 +240,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); ctx.device.backend.set_pending_rx(false); ctx.signal_txq_event(); @@ -257,7 +257,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); ctx.device.backend.set_pending_rx(true); ctx.signal_txq_event(); @@ -273,7 +273,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); ctx.device.backend.set_pending_rx(false); ctx.device.backend.set_tx_err(Some(VsockError::NoData)); @@ -289,7 +289,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); // Invalidate the descriptor chain, by setting its length to 0. ctx.guest_txvq.dtable[0].len.set(0); @@ -306,7 +306,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); assert!(!ctx.device.handle_txq_event(EventSet::IN)); } @@ -321,7 +321,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); ctx.device.backend.set_pending_rx(true); ctx.device.backend.set_rx_err(Some(VsockError::NoData)); @@ -338,7 +338,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); ctx.device.backend.set_pending_rx(true); ctx.signal_rxq_event(); @@ -351,7 +351,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); // Invalidate the descriptor chain, by setting its length to 0. 
ctx.guest_rxvq.dtable[0].len.set(0); @@ -367,7 +367,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); ctx.device.backend.set_pending_rx(false); assert!(!ctx.device.handle_rxq_event(EventSet::IN)); } @@ -392,7 +392,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); ctx.device.backend.set_pending_rx(true); ctx.device.notify_backend(EventSet::IN).unwrap(); @@ -411,7 +411,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); ctx.device.backend.set_pending_rx(false); ctx.device.notify_backend(EventSet::IN).unwrap(); @@ -454,7 +454,7 @@ mod tests { { let mut ctx = test_ctx.create_event_handler_context(); - // When modifiyng the buffer descriptor, make sure the len field is altered in the + // When modifying the buffer descriptor, make sure the len field is altered in the // vsock packet header descriptor as well. if desc_idx == 1 { // The vsock packet len field has offset 24 in the header. @@ -582,7 +582,7 @@ mod tests { vsock .lock() .unwrap() - .activate(test_ctx.mem.clone()) + .activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()) .unwrap(); // Process the activate event. let ev_count = event_manager.run_with_timeout(50).unwrap(); diff --git a/src/vmm/src/devices/virtio/vsock/persist.rs b/src/vmm/src/devices/virtio/vsock/persist.rs index fce6affae69..3d0967926be 100644 --- a/src/vmm/src/devices/virtio/vsock/persist.rs +++ b/src/vmm/src/devices/virtio/vsock/persist.rs @@ -5,14 +5,14 @@ use std::fmt::Debug; use std::sync::Arc; -use std::sync::atomic::AtomicU32; use serde::{Deserialize, Serialize}; use super::*; -use crate::devices::virtio::device::DeviceState; +use crate::devices::virtio::device::{ActiveState, DeviceState}; use crate::devices::virtio::persist::VirtioDeviceState; use crate::devices::virtio::queue::FIRECRACKER_MAX_QUEUE_SIZE; +use crate::devices::virtio::transport::mmio::IrqTrigger; use crate::devices::virtio::vsock::TYPE_VSOCK; use crate::snapshot::Persist; use crate::vstate::memory::GuestMemoryMmap; @@ -29,7 +29,7 @@ pub struct VsockState { /// The Vsock frontend serializable state. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct VsockFrontendState { - /// Context IDentifier. + /// Context Identifier. pub cid: u64, virtio_state: VirtioDeviceState, } @@ -53,6 +53,8 @@ pub struct VsockUdsState { pub struct VsockConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, + /// Interrupt to use for the device. + pub interrupt: Arc, /// The vsock Unix Backend. 
pub backend: B, } @@ -121,10 +123,11 @@ where vsock.acked_features = state.virtio_state.acked_features; vsock.avail_features = state.virtio_state.avail_features; - vsock.irq_trigger.irq_status = - Arc::new(AtomicU32::new(state.virtio_state.interrupt_status)); vsock.device_state = if state.virtio_state.activated { - DeviceState::Activated(constructor_args.mem) + DeviceState::Activated(ActiveState { + mem: constructor_args.mem, + interrupt: constructor_args.interrupt, + }) } else { DeviceState::Inactive }; @@ -137,6 +140,7 @@ pub(crate) mod tests { use super::device::AVAIL_FEATURES; use super::*; use crate::devices::virtio::device::VirtioDevice; + use crate::devices::virtio::test_utils::default_interrupt; use crate::devices::virtio::vsock::defs::uapi; use crate::devices::virtio::vsock::test_utils::{TestBackend, TestContext}; use crate::snapshot::Snapshot; @@ -189,6 +193,7 @@ pub(crate) mod tests { let mut restored_device = Vsock::restore( VsockConstructorArgs { mem: ctx.mem.clone(), + interrupt: default_interrupt(), backend: match restored_state.backend { VsockBackendState::Uds(uds_state) => { assert_eq!(uds_state.path, "test".to_owned()); diff --git a/src/vmm/src/devices/virtio/vsock/test_utils.rs b/src/vmm/src/devices/virtio/vsock/test_utils.rs index 921c2e79bdb..56795e5fd36 100644 --- a/src/vmm/src/devices/virtio/vsock/test_utils.rs +++ b/src/vmm/src/devices/virtio/vsock/test_utils.rs @@ -5,6 +5,7 @@ #![doc(hidden)] use std::os::unix::io::{AsRawFd, RawFd}; +use std::sync::Arc; use vmm_sys_util::epoll::EventSet; use vmm_sys_util::eventfd::EventFd; @@ -12,7 +13,8 @@ use vmm_sys_util::eventfd::EventFd; use super::packet::{VsockPacketRx, VsockPacketTx}; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::{VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; -use crate::devices::virtio::test_utils::VirtQueue as GuestQ; +use crate::devices::virtio::test_utils::{VirtQueue as GuestQ, default_interrupt}; +use crate::devices::virtio::transport::mmio::IrqTrigger; use crate::devices::virtio::vsock::device::{RXQ_INDEX, TXQ_INDEX}; use crate::devices::virtio::vsock::packet::VSOCK_PKT_HDR_SIZE; use crate::devices::virtio::vsock::{ @@ -117,6 +119,7 @@ impl VsockBackend for TestBackend {} pub struct TestContext { pub cid: u64, pub mem: GuestMemoryMmap, + pub interrupt: Arc, pub mem_size: usize, pub device: Vsock, } @@ -134,6 +137,7 @@ impl TestContext { Self { cid: CID, mem, + interrupt: default_interrupt(), mem_size: MEM_SIZE, device, } @@ -196,9 +200,9 @@ pub struct EventHandlerContext<'a> { } impl EventHandlerContext<'_> { - pub fn mock_activate(&mut self, mem: GuestMemoryMmap) { + pub fn mock_activate(&mut self, mem: GuestMemoryMmap, interrupt: Arc) { // Artificially activate the device. - self.device.activate(mem).unwrap(); + self.device.activate(mem, interrupt).unwrap(); } pub fn signal_txq_event(&mut self) { From 68318a39ba6b53b970b7abf0061c33397a96ec00 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 16 Apr 2025 15:10:31 +0200 Subject: [PATCH 04/99] virtio: add generic interrupt trait Describing the APIs that need to implement types that are used as interrupts for VirtIO devices. Currently, we only use `IrqInterrupt` interrupts, but this will change once we have MSI-X with PCIe devices. 
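As a rough illustration of the call pattern this trait is meant to enable, here is a minimal, self-contained sketch. It is not code from this series: the trait is reduced to `trigger()` and `status()`, and `CountingInterrupt` is a hypothetical implementation used only for the example.

use std::sync::Arc;
use std::sync::atomic::{AtomicU32, Ordering};

/// Reduced model of the interrupt types used by VirtIO devices.
#[derive(Debug, Clone)]
pub enum VirtioInterruptType {
    Config,
    Queue(u16),
}

/// Reduced model of the trait; the real one also exposes notifier() and a
/// test-only has_pending_interrupt().
pub trait VirtioInterrupt: std::fmt::Debug + Send + Sync {
    fn trigger(&self, interrupt_type: VirtioInterruptType) -> Result<(), std::io::Error>;
    fn status(&self) -> Arc<AtomicU32>;
}

/// Hypothetical implementation that only counts triggers. An MMIO transport
/// would set bits in irq_status and write an eventfd here; a PCIe transport
/// could signal an MSI-X vector instead.
#[derive(Debug, Default)]
pub struct CountingInterrupt {
    status: Arc<AtomicU32>,
}

impl VirtioInterrupt for CountingInterrupt {
    fn trigger(&self, _interrupt_type: VirtioInterruptType) -> Result<(), std::io::Error> {
        self.status.fetch_add(1, Ordering::SeqCst);
        Ok(())
    }

    fn status(&self) -> Arc<AtomicU32> {
        self.status.clone()
    }
}

fn main() -> Result<(), std::io::Error> {
    // Devices only ever see the trait object, so the same code path serves
    // used-buffer and config-change notifications regardless of transport.
    let irq: Arc<dyn VirtioInterrupt> = Arc::new(CountingInterrupt::default());
    irq.trigger(VirtioInterruptType::Queue(0))?;
    irq.trigger(VirtioInterruptType::Config)?;
    assert_eq!(irq.status().load(Ordering::SeqCst), 2);
    Ok(())
}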
Signed-off-by: Babis Chalios --- src/vmm/src/devices/virtio/transport/mmio.rs | 64 +++++++++++++++++--- src/vmm/src/devices/virtio/transport/mod.rs | 32 ++++++++++ 2 files changed, 88 insertions(+), 8 deletions(-) diff --git a/src/vmm/src/devices/virtio/transport/mmio.rs b/src/vmm/src/devices/virtio/transport/mmio.rs index f1a8c8bfabf..224f086fdbb 100644 --- a/src/vmm/src/devices/virtio/transport/mmio.rs +++ b/src/vmm/src/devices/virtio/transport/mmio.rs @@ -11,6 +11,7 @@ use std::sync::{Arc, Mutex, MutexGuard}; use vmm_sys_util::eventfd::EventFd; +use super::{VirtioInterrupt, VirtioInterruptType}; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::device_status; use crate::devices::virtio::queue::Queue; @@ -368,6 +369,15 @@ pub enum IrqType { Vring, } +impl From for IrqType { + fn from(interrupt_type: VirtioInterruptType) -> Self { + match interrupt_type { + VirtioInterruptType::Config => IrqType::Config, + VirtioInterruptType::Queue(_) => IrqType::Vring, + } + } +} + /// Helper struct that is responsible for triggering guest IRQs #[derive(Debug)] pub struct IrqTrigger { @@ -381,6 +391,40 @@ impl Default for IrqTrigger { } } +impl VirtioInterrupt for IrqTrigger { + fn trigger(&self, interrupt_type: VirtioInterruptType) -> Result<(), std::io::Error> { + match interrupt_type { + VirtioInterruptType::Config => self.trigger_irq(IrqType::Config), + VirtioInterruptType::Queue(_) => self.trigger_irq(IrqType::Vring), + } + } + + fn notifier(&self, _interrupt_type: VirtioInterruptType) -> Option<&EventFd> { + Some(&self.irq_evt) + } + + fn status(&self) -> Arc { + self.irq_status.clone() + } + + #[cfg(test)] + fn has_pending_interrupt(&self, interrupt_type: VirtioInterruptType) -> bool { + if let Ok(num_irqs) = self.irq_evt.read() { + if num_irqs == 0 { + return false; + } + + let irq_status = self.irq_status.load(Ordering::SeqCst); + return matches!( + (irq_status, interrupt_type.into()), + (VIRTIO_MMIO_INT_CONFIG, IrqType::Config) | (VIRTIO_MMIO_INT_VRING, IrqType::Vring) + ); + } + + false + } +} + impl IrqTrigger { pub fn new() -> Self { Self { @@ -1070,19 +1114,23 @@ pub(crate) mod tests { assert_eq!(irq_trigger.irq_status.load(Ordering::SeqCst), 0); // Check that there are no pending irqs. - assert!(!irq_trigger.has_pending_irq(IrqType::Config)); - assert!(!irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(!irq_trigger.has_pending_interrupt(VirtioInterruptType::Config)); + assert!(!irq_trigger.has_pending_interrupt(VirtioInterruptType::Queue(0))); // Check that trigger_irq() correctly generates irqs. - irq_trigger.trigger_irq(IrqType::Config).unwrap(); - assert!(irq_trigger.has_pending_irq(IrqType::Config)); + irq_trigger.trigger(VirtioInterruptType::Config).unwrap(); + assert!(irq_trigger.has_pending_interrupt(VirtioInterruptType::Config)); irq_trigger.irq_status.store(0, Ordering::SeqCst); - irq_trigger.trigger_irq(IrqType::Vring).unwrap(); - assert!(irq_trigger.has_pending_irq(IrqType::Vring)); + irq_trigger.trigger(VirtioInterruptType::Queue(0)).unwrap(); + assert!(irq_trigger.has_pending_interrupt(VirtioInterruptType::Queue(0))); // Check trigger_irq() failure case (irq_evt is full). 
irq_trigger.irq_evt.write(u64::MAX - 1).unwrap(); - irq_trigger.trigger_irq(IrqType::Config).unwrap_err(); - irq_trigger.trigger_irq(IrqType::Vring).unwrap_err(); + irq_trigger + .trigger(VirtioInterruptType::Config) + .unwrap_err(); + irq_trigger + .trigger(VirtioInterruptType::Queue(0)) + .unwrap_err(); } } diff --git a/src/vmm/src/devices/virtio/transport/mod.rs b/src/vmm/src/devices/virtio/transport/mod.rs index 1ff8229a1c8..d41ad943aa2 100644 --- a/src/vmm/src/devices/virtio/transport/mod.rs +++ b/src/vmm/src/devices/virtio/transport/mod.rs @@ -1,5 +1,37 @@ // Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 +use std::sync::Arc; +use std::sync::atomic::AtomicU32; + +use vmm_sys_util::eventfd::EventFd; + /// MMIO transport for VirtIO devices pub mod mmio; + +/// Represents the types of interrupts used by VirtIO devices +#[derive(Debug, Clone)] +pub enum VirtioInterruptType { + /// Interrupt for VirtIO configuration changes + Config, + /// Interrupts for new events in a queue. + Queue(u16), +} + +/// API of interrupt types used by VirtIO devices +pub trait VirtioInterrupt: std::fmt::Debug + Send + Sync { + /// Trigger a VirtIO interrupt. + fn trigger(&self, interrupt_type: VirtioInterruptType) -> Result<(), std::io::Error>; + + /// Get the `EventFd` (if any) that backs the underlying interrupt. + fn notifier(&self, _interrupt_type: VirtioInterruptType) -> Option<&EventFd> { + None + } + + /// Get the current device interrupt status. + fn status(&self) -> Arc; + + /// Returns true if there is any pending interrupt + #[cfg(test)] + fn has_pending_interrupt(&self, interrupt_type: VirtioInterruptType) -> bool; +} From f2b0b4b1f684fab6905ea4ca66dc182db1be08a5 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 22 Apr 2025 10:01:51 +0200 Subject: [PATCH 05/99] refactor: use VirtioInterrupt in VirtIO devices VirtIO devices assume they're operating under an MMIO transport and as a consequence they use IrqTrigger as interrupts. Switch that to using VirtioInterrupt for all VirtIO device objects. Only assume a VirtioInterrupt is an IrqTrigger in MMIO specific code. 
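Schematically, the per-device change has the following shape. This is a sketch, not code from the patch: it assumes the `VirtioInterrupt`/`VirtioInterruptType` definitions introduced in the previous patch (or the toy versions from the sketch above) are in scope, and `ExampleDevice` is a hypothetical stand-in for net/block/balloon/vsock/rng.

use std::sync::Arc;

#[derive(Debug)]
struct ActiveState {
    // The real ActiveState also carries the guest memory (GuestMemoryMmap),
    // elided here.
    interrupt: Arc<dyn VirtioInterrupt>,
}

#[derive(Debug, Default)]
struct ExampleDevice {
    // Stands in for DeviceState::Inactive / DeviceState::Activated(ActiveState).
    active: Option<ActiveState>,
}

impl ExampleDevice {
    // The transport hands the interrupt to the device at activation time,
    // instead of the device owning an IrqTrigger from construction.
    fn activate(&mut self, interrupt: Arc<dyn VirtioInterrupt>) {
        self.active = Some(ActiveState { interrupt });
    }

    // Device code signals the guest purely through the trait; only
    // MMIO-specific code keeps assuming an IrqTrigger behind it.
    fn signal_used_queue(&self, qidx: u16) -> Result<(), std::io::Error> {
        self.active
            .as_ref()
            .expect("Device is not activated")
            .interrupt
            .trigger(VirtioInterruptType::Queue(qidx))
    }
}

A test can then hand the device an `Arc<IrqTrigger>` (as `default_interrupt()` does) or any other `VirtioInterrupt` implementation.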
Signed-off-by: Babis Chalios --- src/vmm/src/device_manager/mmio.rs | 12 +- src/vmm/src/devices/virtio/balloon/device.rs | 30 ++-- src/vmm/src/devices/virtio/balloon/persist.rs | 4 +- .../src/devices/virtio/balloon/test_utils.rs | 10 +- src/vmm/src/devices/virtio/block/device.rs | 6 +- src/vmm/src/devices/virtio/block/persist.rs | 4 +- .../devices/virtio/block/vhost_user/device.rs | 20 +-- .../devices/virtio/block/vhost_user/mod.rs | 2 +- .../src/devices/virtio/block/virtio/device.rs | 25 ++-- .../src/devices/virtio/block/virtio/mod.rs | 4 +- .../devices/virtio/block/virtio/test_utils.rs | 9 +- src/vmm/src/devices/virtio/device.rs | 14 +- src/vmm/src/devices/virtio/net/device.rs | 136 ++++++++++++++---- src/vmm/src/devices/virtio/net/persist.rs | 4 +- src/vmm/src/devices/virtio/net/test_utils.rs | 6 +- src/vmm/src/devices/virtio/rng/device.rs | 19 ++- src/vmm/src/devices/virtio/rng/persist.rs | 6 +- src/vmm/src/devices/virtio/test_utils.rs | 6 +- src/vmm/src/devices/virtio/transport/mmio.rs | 35 ++--- src/vmm/src/devices/virtio/vhost_user.rs | 28 +++- src/vmm/src/devices/virtio/vsock/device.rs | 19 +-- .../src/devices/virtio/vsock/event_handler.rs | 5 +- src/vmm/src/devices/virtio/vsock/persist.rs | 4 +- .../src/devices/virtio/vsock/test_utils.rs | 6 +- 24 files changed, 258 insertions(+), 156 deletions(-) diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index ea9e5c6ab37..f99db17e747 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -504,7 +504,7 @@ impl MMIODeviceManager { .unwrap(); if vsock.is_activated() { info!("kick vsock {id}."); - vsock.signal_used_queue().unwrap(); + vsock.signal_used_queue(0).unwrap(); } } TYPE_RNG => { @@ -524,6 +524,7 @@ impl MMIODeviceManager { #[cfg(test)] mod tests { + use std::ops::Deref; use std::sync::Arc; use vmm_sys_util::eventfd::EventFd; @@ -533,6 +534,7 @@ mod tests { use crate::devices::virtio::ActivateError; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::Queue; + use crate::devices::virtio::transport::VirtioInterrupt; use crate::devices::virtio::transport::mmio::IrqTrigger; use crate::test_utils::multi_region_mem_raw; use crate::vstate::kvm::Kvm; @@ -619,10 +621,8 @@ mod tests { &self.queue_evts } - fn interrupt_trigger(&self) -> &IrqTrigger { - self.interrupt_trigger - .as_ref() - .expect("Device is not activated") + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { + self.interrupt_trigger.as_ref().unwrap().deref() } fn ack_features_by_page(&mut self, page: u32, value: u32) { @@ -643,7 +643,7 @@ mod tests { fn activate( &mut self, _: GuestMemoryMmap, - _: Arc, + _: Arc, ) -> Result<(), ActivateError> { Ok(()) } diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs index 35824de1e80..3cfcbed4465 100644 --- a/src/vmm/src/devices/virtio/balloon/device.rs +++ b/src/vmm/src/devices/virtio/balloon/device.rs @@ -1,6 +1,7 @@ // Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 +use std::ops::Deref; use std::sync::Arc; use std::time::Duration; @@ -27,7 +28,7 @@ use crate::devices::virtio::balloon::BalloonError; use crate::devices::virtio::device::ActiveState; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::queue::InvalidAvailIdx; -use crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; +use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::logger::IncMetric; use crate::utils::u64_to_usize; use crate::vstate::memory::{Address, ByteValued, Bytes, GuestAddress, GuestMemoryMmap}; @@ -342,7 +343,7 @@ impl Balloon { queue.advance_used_ring_idx(); if needs_interrupt { - self.signal_used_queue()?; + self.signal_used_queue(INFLATE_INDEX)?; } Ok(()) @@ -361,7 +362,7 @@ impl Balloon { queue.advance_used_ring_idx(); if needs_interrupt { - self.signal_used_queue() + self.signal_used_queue(DEFLATE_INDEX) } else { Ok(()) } @@ -403,9 +404,12 @@ impl Balloon { Ok(()) } - pub(crate) fn signal_used_queue(&self) -> Result<(), BalloonError> { + pub(crate) fn signal_used_queue(&self, qidx: usize) -> Result<(), BalloonError> { self.interrupt_trigger() - .trigger_irq(IrqType::Vring) + .trigger(VirtioInterruptType::Queue( + qidx.try_into() + .unwrap_or_else(|_| panic!("balloon: invalid queue id: {qidx}")), + )) .map_err(|err| { METRICS.event_fails.inc(); BalloonError::InterruptError(err) @@ -435,7 +439,7 @@ impl Balloon { if let Some(index) = self.stats_desc_index.take() { self.queues[STATS_INDEX].add_used(index, 0)?; self.queues[STATS_INDEX].advance_used_ring_idx(); - self.signal_used_queue() + self.signal_used_queue(STATS_INDEX) } else { error!("Failed to update balloon stats, missing descriptor."); Ok(()) @@ -447,7 +451,7 @@ impl Balloon { if self.is_activated() { self.config_space.num_pages = mib_to_pages(amount_mib)?; self.interrupt_trigger() - .trigger_irq(IrqType::Config) + .trigger(VirtioInterruptType::Config) .map_err(BalloonError::InterruptError) } else { Err(BalloonError::DeviceNotActive) @@ -558,12 +562,12 @@ impl VirtioDevice for Balloon { &self.queue_evts } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self - .device_state + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { + self.device_state .active_state() .expect("Device is not activated") .interrupt + .deref() } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -593,7 +597,7 @@ impl VirtioDevice for Balloon { fn activate( &mut self, mem: GuestMemoryMmap, - interrupt: Arc, + interrupt: Arc, ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) @@ -1070,7 +1074,9 @@ pub(crate) mod tests { assert!(balloon.stats_desc_index.is_some()); balloon.process_stats_timer_event().unwrap(); assert!(balloon.stats_desc_index.is_none()); - assert!(balloon.interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!(balloon.interrupt_trigger().has_pending_interrupt( + VirtioInterruptType::Queue(STATS_INDEX.try_into().unwrap()) + )); }); } } diff --git a/src/vmm/src/devices/virtio/balloon/persist.rs b/src/vmm/src/devices/virtio/balloon/persist.rs index 397dd8aeb3e..a6634d07170 100644 --- a/src/vmm/src/devices/virtio/balloon/persist.rs +++ b/src/vmm/src/devices/virtio/balloon/persist.rs @@ -15,7 +15,7 @@ use crate::devices::virtio::balloon::device::{BalloonStats, ConfigSpace}; use crate::devices::virtio::device::{ActiveState, DeviceState}; use crate::devices::virtio::persist::VirtioDeviceState; use 
crate::devices::virtio::queue::FIRECRACKER_MAX_QUEUE_SIZE; -use crate::devices::virtio::transport::mmio::IrqTrigger; +use crate::devices::virtio::transport::VirtioInterrupt; use crate::snapshot::Persist; use crate::vstate::memory::GuestMemoryMmap; @@ -96,7 +96,7 @@ pub struct BalloonConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, /// Interrupt used from the device. - pub interrupt: Arc, + pub interrupt: Arc, pub restored_from_file: bool, } diff --git a/src/vmm/src/devices/virtio/balloon/test_utils.rs b/src/vmm/src/devices/virtio/balloon/test_utils.rs index e588abaedee..2665d5dbd87 100644 --- a/src/vmm/src/devices/virtio/balloon/test_utils.rs +++ b/src/vmm/src/devices/virtio/balloon/test_utils.rs @@ -3,6 +3,8 @@ #![doc(hidden)] +#[cfg(test)] +use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::test_utils::VirtQueue; #[cfg(test)] use crate::devices::virtio::{balloon::BALLOON_NUM_QUEUES, balloon::Balloon}; @@ -10,8 +12,7 @@ use crate::devices::virtio::{balloon::BALLOON_NUM_QUEUES, balloon::Balloon}; #[cfg(test)] pub fn invoke_handler_for_queue_event(b: &mut Balloon, queue_index: usize) { use crate::devices::virtio::balloon::{DEFLATE_INDEX, INFLATE_INDEX, STATS_INDEX}; - use crate::devices::virtio::device::VirtioDevice; - use crate::devices::virtio::transport::mmio::IrqType; + use crate::devices::virtio::transport::VirtioInterruptType; assert!(queue_index < BALLOON_NUM_QUEUES); // Trigger the queue event. @@ -25,7 +26,10 @@ pub fn invoke_handler_for_queue_event(b: &mut Balloon, queue_index: usize) { }; // Validate the queue operation finished successfully. let interrupt = b.interrupt_trigger(); - assert!(interrupt.has_pending_irq(IrqType::Vring)); + assert!( + interrupt + .has_pending_interrupt(VirtioInterruptType::Queue(queue_index.try_into().unwrap())) + ); } pub fn set_request(queue: &VirtQueue, idx: u16, addr: u64, len: u32, flags: u16) { diff --git a/src/vmm/src/devices/virtio/block/device.rs b/src/vmm/src/devices/virtio/block/device.rs index 5a491c537c5..d58550acc59 100644 --- a/src/vmm/src/devices/virtio/block/device.rs +++ b/src/vmm/src/devices/virtio/block/device.rs @@ -12,7 +12,7 @@ use super::vhost_user::device::{VhostUserBlock, VhostUserBlockConfig}; use super::virtio::device::{VirtioBlock, VirtioBlockConfig}; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::{InvalidAvailIdx, Queue}; -use crate::devices::virtio::transport::mmio::IrqTrigger; +use crate::devices::virtio::transport::VirtioInterrupt; use crate::devices::virtio::{ActivateError, TYPE_BLOCK}; use crate::rate_limiter::BucketUpdate; use crate::snapshot::Persist; @@ -176,7 +176,7 @@ impl VirtioDevice for Block { } } - fn interrupt_trigger(&self) -> &IrqTrigger { + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { match self { Self::Virtio(b) => b.interrupt_trigger(), Self::VhostUser(b) => b.interrupt_trigger(), @@ -200,7 +200,7 @@ impl VirtioDevice for Block { fn activate( &mut self, mem: GuestMemoryMmap, - interrupt: Arc, + interrupt: Arc, ) -> Result<(), ActivateError> { match self { Self::Virtio(b) => b.activate(mem, interrupt), diff --git a/src/vmm/src/devices/virtio/block/persist.rs b/src/vmm/src/devices/virtio/block/persist.rs index e7ae1768cca..57712a8fb3a 100644 --- a/src/vmm/src/devices/virtio/block/persist.rs +++ b/src/vmm/src/devices/virtio/block/persist.rs @@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize}; use super::vhost_user::persist::VhostUserBlockState; use super::virtio::persist::VirtioBlockState; -use 
crate::devices::virtio::transport::mmio::IrqTrigger; +use crate::devices::virtio::transport::VirtioInterrupt; use crate::vstate::memory::GuestMemoryMmap; /// Block device state. @@ -21,5 +21,5 @@ pub enum BlockState { #[derive(Debug)] pub struct BlockConstructorArgs { pub mem: GuestMemoryMmap, - pub interrupt: Arc, + pub interrupt: Arc, } diff --git a/src/vmm/src/devices/virtio/block/vhost_user/device.rs b/src/vmm/src/devices/virtio/block/vhost_user/device.rs index 22429996d5f..1d6c2aac080 100644 --- a/src/vmm/src/devices/virtio/block/vhost_user/device.rs +++ b/src/vmm/src/devices/virtio/block/vhost_user/device.rs @@ -4,6 +4,7 @@ // Portions Copyright 2019 Intel Corporation. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 +use std::ops::Deref; use std::sync::Arc; use log::error; @@ -14,13 +15,12 @@ use vmm_sys_util::eventfd::EventFd; use super::{NUM_QUEUES, QUEUE_SIZE, VhostUserBlockError}; use crate::devices::virtio::block::CacheType; -use crate::devices::virtio::device::ActiveState; -use crate::devices::virtio::device::{DeviceState, VirtioDevice}; +use crate::devices::virtio::device::{ActiveState, DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_blk::{VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO}; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::generated::virtio_ring::VIRTIO_RING_F_EVENT_IDX; use crate::devices::virtio::queue::Queue; -use crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; +use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::devices::virtio::vhost_user::{VhostUserHandleBackend, VhostUserHandleImpl}; use crate::devices::virtio::vhost_user_metrics::{ VhostUserDeviceMetrics, VhostUserMetricsPerDevice, @@ -275,8 +275,8 @@ impl VhostUserBlockImpl { .map_err(VhostUserBlockError::Vhost)?; self.config_space = new_config_space; interrupt - .trigger_irq(IrqType::Config) - .map_err(VhostUserBlockError::IrqTrigger)?; + .trigger(VirtioInterruptType::Config) + .map_err(VhostUserBlockError::Interrupt)?; let delta_us = get_time_us(ClockType::Monotonic) - start_time; self.metrics.config_change_time_us.store(delta_us); @@ -314,12 +314,12 @@ impl VirtioDevice for VhostUserBlock &self.queue_evts } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self - .device_state + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { + self.device_state .active_state() .expect("Device is not initialized") .interrupt + .deref() } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -341,7 +341,7 @@ impl VirtioDevice for VhostUserBlock fn activate( &mut self, mem: GuestMemoryMmap, - interrupt: Arc, + interrupt: Arc, ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) @@ -357,7 +357,7 @@ impl VirtioDevice for VhostUserBlock self.vu_handle.setup_backend( &mem, &[(0, &self.queues[0], &self.queue_evts[0])], - &interrupt, + interrupt.clone(), ) }) .map_err(|err| { diff --git a/src/vmm/src/devices/virtio/block/vhost_user/mod.rs b/src/vmm/src/devices/virtio/block/vhost_user/mod.rs index 8d4d9f44261..0afaaed3400 100644 --- a/src/vmm/src/devices/virtio/block/vhost_user/mod.rs +++ b/src/vmm/src/devices/virtio/block/vhost_user/mod.rs @@ -28,5 +28,5 @@ pub enum VhostUserBlockError { /// Error opening eventfd: {0} EventFd(std::io::Error), /// Error creating irqfd: {0} - IrqTrigger(std::io::Error), + Interrupt(std::io::Error), } diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index 
413410f2af6..d04fd5674ea 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -9,6 +9,7 @@ use std::cmp; use std::convert::From; use std::fs::{File, OpenOptions}; use std::io::{Seek, SeekFrom}; +use std::ops::Deref; use std::os::linux::fs::MetadataExt; use std::path::PathBuf; use std::sync::Arc; @@ -30,7 +31,7 @@ use crate::devices::virtio::generated::virtio_blk::{ use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::generated::virtio_ring::VIRTIO_RING_F_EVENT_IDX; use crate::devices::virtio::queue::{InvalidAvailIdx, Queue}; -use crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; +use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::devices::virtio::{ActivateError, TYPE_BLOCK}; use crate::logger::{IncMetric, error, warn}; use crate::rate_limiter::{BucketUpdate, RateLimiter}; @@ -446,7 +447,7 @@ impl VirtioBlock { if used_any && queue.prepare_kick() { active_state .interrupt - .trigger_irq(IrqType::Vring) + .trigger(VirtioInterruptType::Queue(0)) .unwrap_or_else(|_| { self.metrics.event_fails.inc(); }); @@ -509,7 +510,7 @@ impl VirtioBlock { if queue.prepare_kick() { active_state .interrupt - .trigger_irq(IrqType::Vring) + .trigger(VirtioInterruptType::Queue(0)) .unwrap_or_else(|_| { self.metrics.event_fails.inc(); }); @@ -536,10 +537,12 @@ impl VirtioBlock { self.disk.update(disk_image_path, self.read_only)?; self.config_space.capacity = self.disk.nsectors.to_le(); // virtio_block_config_space(); - // Kick the driver to pick up the changes. - self.interrupt_trigger() - .trigger_irq(IrqType::Config) - .unwrap(); + // Kick the driver to pick up the changes. (But only if the device is already activated). 
+ if self.is_activated() { + self.interrupt_trigger() + .trigger(VirtioInterruptType::Config) + .unwrap(); + } self.metrics.update_count.inc(); Ok(()) @@ -606,12 +609,12 @@ impl VirtioDevice for VirtioBlock { &self.queue_evts } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self - .device_state + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { + self.device_state .active_state() .expect("Device is not initialized") .interrupt + .deref() } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -643,7 +646,7 @@ impl VirtioDevice for VirtioBlock { fn activate( &mut self, mem: GuestMemoryMmap, - interrupt: Arc, + interrupt: Arc, ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) diff --git a/src/vmm/src/devices/virtio/block/virtio/mod.rs b/src/vmm/src/devices/virtio/block/virtio/mod.rs index 8ea59a5aba4..9e97d6d3897 100644 --- a/src/vmm/src/devices/virtio/block/virtio/mod.rs +++ b/src/vmm/src/devices/virtio/block/virtio/mod.rs @@ -57,8 +57,8 @@ pub enum VirtioBlockError { BackingFile(std::io::Error, String), /// Error opening eventfd: {0} EventFd(std::io::Error), - /// Error creating an irqfd: {0} - IrqTrigger(std::io::Error), + /// Error creating an interrupt: {0} + Interrupt(std::io::Error), /// Error coming from the rate limiter: {0} RateLimiter(std::io::Error), /// Persistence error: {0} diff --git a/src/vmm/src/devices/virtio/block/virtio/test_utils.rs b/src/vmm/src/devices/virtio/block/virtio/test_utils.rs index 14e2f1d33d0..e4f23c6a038 100644 --- a/src/vmm/src/devices/virtio/block/virtio/test_utils.rs +++ b/src/vmm/src/devices/virtio/block/virtio/test_utils.rs @@ -21,7 +21,7 @@ use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::{Queue, VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; use crate::devices::virtio::test_utils::{VirtQueue, VirtqDesc}; #[cfg(test)] -use crate::devices::virtio::transport::mmio::IrqType; +use crate::devices::virtio::transport::VirtioInterruptType; use crate::rate_limiter::RateLimiter; use crate::vmm_config::{RateLimiterConfig, TokenBucketConfig}; use crate::vstate::memory::{Bytes, GuestAddress}; @@ -79,13 +79,15 @@ pub fn rate_limiter(blk: &mut VirtioBlock) -> &RateLimiter { #[cfg(test)] pub fn simulate_queue_event(b: &mut VirtioBlock, maybe_expected_irq: Option) { // Trigger the queue event. + b.queue_evts[0].write(1).unwrap(); // Handle event. b.process_queue_event(); // Validate the queue operation finished successfully. if let Some(expected_irq) = maybe_expected_irq { assert_eq!( - b.interrupt_trigger().has_pending_irq(IrqType::Vring), + b.interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(0)), expected_irq ); } @@ -104,7 +106,8 @@ pub fn simulate_async_completion_event(b: &mut VirtioBlock, expected_irq: bool) // Validate if there are pending IRQs. 
assert_eq!( - b.interrupt_trigger().has_pending_irq(IrqType::Vring), + b.interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(0)), expected_irq ); } diff --git a/src/vmm/src/devices/virtio/device.rs b/src/vmm/src/devices/virtio/device.rs index efcdd7170c5..0b09195d8f7 100644 --- a/src/vmm/src/devices/virtio/device.rs +++ b/src/vmm/src/devices/virtio/device.rs @@ -13,7 +13,7 @@ use vmm_sys_util::eventfd::EventFd; use super::ActivateError; use super::queue::{Queue, QueueError}; -use super::transport::mmio::IrqTrigger; +use super::transport::VirtioInterrupt; use crate::devices::virtio::AsAny; use crate::logger::warn; use crate::vstate::memory::GuestMemoryMmap; @@ -22,7 +22,7 @@ use crate::vstate::memory::GuestMemoryMmap; #[derive(Debug, Clone)] pub struct ActiveState { pub mem: GuestMemoryMmap, - pub interrupt: Arc, + pub interrupt: Arc, } /// Enum that indicates if a VirtioDevice is inactive or has been activated @@ -88,10 +88,10 @@ pub trait VirtioDevice: AsAny + Send { /// Returns the current device interrupt status. fn interrupt_status(&self) -> Arc { - Arc::clone(&self.interrupt_trigger().irq_status) + self.interrupt_trigger().status() } - fn interrupt_trigger(&self) -> &IrqTrigger; + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt; /// The set of feature bits shifted by `page * 32`. fn avail_features_by_page(&self, page: u32) -> u32 { @@ -140,7 +140,7 @@ pub trait VirtioDevice: AsAny + Send { fn activate( &mut self, mem: GuestMemoryMmap, - interrupt: Arc, + interrupt: Arc, ) -> Result<(), ActivateError>; /// Checks if the resources of this device are activated. @@ -205,7 +205,7 @@ pub(crate) mod tests { todo!() } - fn interrupt_trigger(&self) -> &IrqTrigger { + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { todo!() } @@ -220,7 +220,7 @@ pub(crate) mod tests { fn activate( &mut self, _mem: GuestMemoryMmap, - _interrupt: Arc, + _interrupt: Arc, ) -> Result<(), ActivateError> { todo!() } diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index 9949b404809..cf9f445d5df 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -8,6 +8,7 @@ use std::collections::VecDeque; use std::mem::{self}; use std::net::Ipv4Addr; +use std::ops::Deref; use std::sync::{Arc, Mutex}; use libc::{EAGAIN, iovec}; @@ -32,7 +33,7 @@ use crate::devices::virtio::net::{ MAX_BUFFER_SIZE, NET_QUEUE_SIZES, NetError, NetQueue, RX_INDEX, TX_INDEX, generated, }; use crate::devices::virtio::queue::{DescriptorChain, InvalidAvailIdx, Queue}; -use crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; +use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::devices::virtio::{ActivateError, TYPE_NET}; use crate::devices::{DeviceError, report_net_event_fail}; use crate::dumbo::pdu::arp::ETH_IPV4_FRAME_LEN; @@ -390,15 +391,15 @@ impl Net { /// https://docs.oasis-open.org/virtio/virtio/v1.1/csprd01/virtio-v1.1-csprd01.html#x1-320005 /// 2.6.7.1 Driver Requirements: Used Buffer Notification Suppression fn try_signal_queue(&mut self, queue_type: NetQueue) -> Result<(), DeviceError> { - let queue = match queue_type { - NetQueue::Rx => &mut self.queues[RX_INDEX], - NetQueue::Tx => &mut self.queues[TX_INDEX], + let qidx = match queue_type { + NetQueue::Rx => RX_INDEX, + NetQueue::Tx => TX_INDEX, }; - queue.advance_used_ring_idx(); + self.queues[qidx].advance_used_ring_idx(); - if queue.prepare_kick() { + if self.queues[qidx].prepare_kick() { self.interrupt_trigger() - 
.trigger_irq(IrqType::Vring) + .trigger(VirtioInterruptType::Queue(qidx.try_into().unwrap())) .map_err(|err| { self.metrics.event_fails.inc(); DeviceError::FailedSignalingIrq(err) @@ -966,12 +967,12 @@ impl VirtioDevice for Net { &self.queue_evts } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self - .device_state + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { + self.device_state .active_state() .expect("Device is not implemented") .interrupt + .deref() } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -1005,7 +1006,7 @@ impl VirtioDevice for Net { fn activate( &mut self, mem: GuestMemoryMmap, - interrupt: Arc, + interrupt: Arc, ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) @@ -1066,7 +1067,6 @@ pub mod tests { }; use crate::devices::virtio::queue::VIRTQ_DESC_F_WRITE; use crate::devices::virtio::test_utils::VirtQueue; - use crate::devices::virtio::transport::mmio::IrqType; use crate::dumbo::EthernetFrame; use crate::dumbo::pdu::arp::{ETH_IPV4_FRAME_LEN, EthIPv4ArpFrame}; use crate::dumbo::pdu::ethernet::ETHERTYPE_ARP; @@ -1409,7 +1409,12 @@ pub mod tests { // Check that the used queue has advanced. assert_eq!(th.rxq.used.idx.get(), 4); - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); + // Check that the invalid descriptor chains have been discarded th.rxq.check_used_elem(0, 0, 0); th.rxq.check_used_elem(1, 3, 0); @@ -1466,7 +1471,11 @@ pub mod tests { assert!(th.net().rx_buffer.used_descriptors == 0); // Check that the used queue has advanced. assert_eq!(th.rxq.used.idx.get(), 1); - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); // Check that the frame has been written successfully to the Rx descriptor chain. header_set_num_buffers(frame.as_mut_slice(), 1); th.rxq @@ -1529,7 +1538,11 @@ pub mod tests { assert!(th.net().rx_buffer.used_bytes == 0); // Check that the used queue has advanced. assert_eq!(th.rxq.used.idx.get(), 2); - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); // Check that the 1st frame was written successfully to the 1st Rx descriptor chain. header_set_num_buffers(frame_1.as_mut_slice(), 1); th.rxq @@ -1587,7 +1600,11 @@ pub mod tests { assert!(th.net().rx_buffer.used_bytes == 0); // Check that the used queue has advanced. assert_eq!(th.rxq.used.idx.get(), 2); - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); // 2 chains should be used for the packet. header_set_num_buffers(frame.as_mut_slice(), 2); @@ -1652,7 +1669,11 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(TX_INDEX as u16)) + ); th.txq.check_used_elem(0, 0, 0); // Check that the frame was skipped. assert!(!tap_traffic_simulator.pop_rx_packet(&mut [])); @@ -1675,7 +1696,11 @@ pub mod tests { // Check that the used queue advanced. 
assert_eq!(th.txq.used.idx.get(), 1); - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(TX_INDEX as u16)) + ); th.txq.check_used_elem(0, 0, 0); // Check that the frame was skipped. assert!(!tap_traffic_simulator.pop_rx_packet(&mut [])); @@ -1702,7 +1727,11 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(TX_INDEX as u16)) + ); th.txq.check_used_elem(0, 0, 0); // Check that the frame was skipped. assert!(!tap_traffic_simulator.pop_rx_packet(&mut [])); @@ -1725,7 +1754,11 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(TX_INDEX as u16)) + ); th.txq.check_used_elem(0, 0, 0); // Check that the frame was skipped. assert!(!tap_traffic_simulator.pop_rx_packet(&mut [])); @@ -1764,7 +1797,11 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 4); - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(TX_INDEX as u16)) + ); th.txq.check_used_elem(3, 4, 0); // Check that the valid frame was sent to the tap. let mut buf = vec![0; 1000]; @@ -1795,7 +1832,11 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(TX_INDEX as u16)) + ); th.txq.check_used_elem(0, 3, 0); // Check that the frame was sent to the tap. let mut buf = vec![0; 1000]; @@ -1824,7 +1865,11 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(TX_INDEX as u16)) + ); th.txq.check_used_elem(0, 0, 0); // dropping th would double close the tap fd, so leak it @@ -1855,7 +1900,11 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 2); - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(TX_INDEX as u16)) + ); th.txq.check_used_elem(0, 0, 0); th.txq.check_used_elem(1, 3, 0); // Check that the first frame was sent to the tap. 
@@ -2207,7 +2256,11 @@ pub mod tests { assert_eq!(th.net().metrics.rx_rate_limiter_throttled.count(), 1); assert!(th.net().rx_buffer.used_descriptors != 0); // assert that no operation actually completed (limiter blocked it) - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); // make sure the data is still queued for processing assert_eq!(th.rxq.used.idx.get(), 0); } @@ -2235,7 +2288,11 @@ pub mod tests { // validate the rate_limiter is no longer blocked assert!(!th.net().rx_rate_limiter.is_blocked()); // make sure the virtio queue operation completed this time - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); // make sure the data queue advanced assert_eq!(th.rxq.used.idx.get(), 1); th.rxq @@ -2332,14 +2389,22 @@ pub mod tests { assert!(th.net().metrics.rx_rate_limiter_throttled.count() >= 1); assert!(th.net().rx_buffer.used_descriptors != 0); // assert that no operation actually completed (limiter blocked it) - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); // make sure the data is still queued for processing assert_eq!(th.rxq.used.idx.get(), 0); // trigger the RX handler again, this time it should do the limiter fast path exit th.simulate_event(NetEvent::Tap); // assert that no operation actually completed, that the limiter blocked it - assert!(!th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + !th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); // make sure the data is still queued for processing assert_eq!(th.rxq.used.idx.get(), 0); } @@ -2352,7 +2417,11 @@ pub mod tests { { th.simulate_event(NetEvent::RxRateLimiter); // make sure the virtio queue operation completed this time - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); // make sure the data queue advanced assert_eq!(th.rxq.used.idx.get(), 1); th.rxq @@ -2422,7 +2491,14 @@ pub mod tests { assert_eq!(net.queue_events().len(), NET_QUEUE_SIZES.len()); // Test interrupts. 
- assert!(!net.interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + !net.interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); + assert!( + !net.interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(TX_INDEX as u16)) + ); } #[test] diff --git a/src/vmm/src/devices/virtio/net/persist.rs b/src/vmm/src/devices/virtio/net/persist.rs index 9072d3dd5e7..5ebd15f9d54 100644 --- a/src/vmm/src/devices/virtio/net/persist.rs +++ b/src/vmm/src/devices/virtio/net/persist.rs @@ -13,7 +13,7 @@ use super::{NET_NUM_QUEUES, NET_QUEUE_MAX_SIZE, RX_INDEX, TapError}; use crate::devices::virtio::TYPE_NET; use crate::devices::virtio::device::{ActiveState, DeviceState}; use crate::devices::virtio::persist::{PersistError as VirtioStateError, VirtioDeviceState}; -use crate::devices::virtio::transport::mmio::IrqTrigger; +use crate::devices::virtio::transport::VirtioInterrupt; use crate::mmds::data_store::Mmds; use crate::mmds::ns::MmdsNetworkStack; use crate::mmds::persist::MmdsNetworkStackState; @@ -72,7 +72,7 @@ pub struct NetConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, /// Interrupt for the device. - pub interrupt: Arc, + pub interrupt: Arc, /// Pointer to the MMDS data store. pub mmds: Option>>, } diff --git a/src/vmm/src/devices/virtio/net/test_utils.rs b/src/vmm/src/devices/virtio/net/test_utils.rs index c81ad58205c..b4fbdf97e3f 100644 --- a/src/vmm/src/devices/virtio/net/test_utils.rs +++ b/src/vmm/src/devices/virtio/net/test_utils.rs @@ -300,7 +300,7 @@ pub mod test { use crate::devices::virtio::net::{MAX_BUFFER_SIZE, Net, RX_INDEX, TX_INDEX}; use crate::devices::virtio::queue::{VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; use crate::devices::virtio::test_utils::{VirtQueue, VirtqDesc, default_interrupt}; - use crate::devices::virtio::transport::mmio::IrqType; + use crate::devices::virtio::transport::VirtioInterruptType; use crate::logger::IncMetric; use crate::vstate::memory::{Address, Bytes, GuestAddress, GuestMemoryMmap}; @@ -444,7 +444,7 @@ pub mod test { assert!( self.net() .interrupt_trigger() - .has_pending_irq(IrqType::Vring) + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) ); frame @@ -474,7 +474,7 @@ pub mod test { assert!( self.net() .interrupt_trigger() - .has_pending_irq(IrqType::Vring) + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) ); self.rxq .check_used_elem(used_idx, 0, expected_frame.len().try_into().unwrap()); diff --git a/src/vmm/src/devices/virtio/rng/device.rs b/src/vmm/src/devices/virtio/rng/device.rs index 1433a7086e2..a0b98cdc8b7 100644 --- a/src/vmm/src/devices/virtio/rng/device.rs +++ b/src/vmm/src/devices/virtio/rng/device.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use std::io; +use std::ops::Deref; use std::sync::Arc; use aws_lc_rs::rand; @@ -16,7 +17,7 @@ use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::iov_deque::IovDequeError; use crate::devices::virtio::iovec::IoVecBufferMut; use crate::devices::virtio::queue::{FIRECRACKER_MAX_QUEUE_SIZE, InvalidAvailIdx, Queue}; -use crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; +use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::devices::virtio::{ActivateError, TYPE_RNG}; use crate::logger::{IncMetric, debug, error}; use crate::rate_limiter::{RateLimiter, TokenType}; @@ -87,7 +88,7 @@ impl Entropy { fn signal_used_queue(&self) -> Result<(), DeviceError> { self.interrupt_trigger() - 
.trigger_irq(IrqType::Vring) + .trigger(VirtioInterruptType::Queue(RNG_QUEUE.try_into().unwrap())) .map_err(DeviceError::FailedSignalingIrq) } @@ -236,7 +237,11 @@ impl Entropy { self.acked_features = features; } - pub(crate) fn set_activated(&mut self, mem: GuestMemoryMmap, interrupt: Arc) { + pub(crate) fn set_activated( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc, + ) { self.device_state = DeviceState::Activated(ActiveState { mem, interrupt }); } @@ -262,12 +267,12 @@ impl VirtioDevice for Entropy { &self.queue_events } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self - .device_state + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { + self.device_state .active_state() .expect("Device is not initialized") .interrupt + .deref() } fn avail_features(&self) -> u64 { @@ -293,7 +298,7 @@ impl VirtioDevice for Entropy { fn activate( &mut self, mem: GuestMemoryMmap, - interrupt: Arc, + interrupt: Arc, ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) diff --git a/src/vmm/src/devices/virtio/rng/persist.rs b/src/vmm/src/devices/virtio/rng/persist.rs index dd2d62debee..75db947c9c7 100644 --- a/src/vmm/src/devices/virtio/rng/persist.rs +++ b/src/vmm/src/devices/virtio/rng/persist.rs @@ -11,7 +11,7 @@ use crate::devices::virtio::TYPE_RNG; use crate::devices::virtio::persist::{PersistError as VirtioStateError, VirtioDeviceState}; use crate::devices::virtio::queue::FIRECRACKER_MAX_QUEUE_SIZE; use crate::devices::virtio::rng::{Entropy, EntropyError, RNG_NUM_QUEUES}; -use crate::devices::virtio::transport::mmio::IrqTrigger; +use crate::devices::virtio::transport::VirtioInterrupt; use crate::rate_limiter::RateLimiter; use crate::rate_limiter::persist::RateLimiterState; use crate::snapshot::Persist; @@ -26,11 +26,11 @@ pub struct EntropyState { #[derive(Debug)] pub struct EntropyConstructorArgs { mem: GuestMemoryMmap, - interrupt: Arc, + interrupt: Arc, } impl EntropyConstructorArgs { - pub fn new(mem: GuestMemoryMmap, interrupt: Arc) -> Self { + pub fn new(mem: GuestMemoryMmap, interrupt: Arc) -> Self { Self { mem, interrupt } } } diff --git a/src/vmm/src/devices/virtio/test_utils.rs b/src/vmm/src/devices/virtio/test_utils.rs index 29fbdc5ec56..861394c1c7d 100644 --- a/src/vmm/src/devices/virtio/test_utils.rs +++ b/src/vmm/src/devices/virtio/test_utils.rs @@ -10,12 +10,12 @@ use std::sync::Arc; use std::sync::atomic::{AtomicUsize, Ordering}; use crate::devices::virtio::queue::Queue; +use crate::devices::virtio::transport::VirtioInterrupt; +use crate::devices::virtio::transport::mmio::IrqTrigger; use crate::test_utils::single_region_mem; use crate::utils::{align_up, u64_to_usize}; use crate::vstate::memory::{Address, Bytes, GuestAddress, GuestMemoryMmap}; -use super::transport::mmio::IrqTrigger; - #[macro_export] macro_rules! check_metric_after_block { ($metric:expr, $delta:expr, $block:expr) => {{ @@ -32,7 +32,7 @@ pub fn default_mem() -> GuestMemoryMmap { } /// Creates a default ['IrqTrigger'] interrupt for a VirtIO device. 
-pub fn default_interrupt() -> Arc { +pub fn default_interrupt() -> Arc { Arc::new(IrqTrigger::new()) } diff --git a/src/vmm/src/devices/virtio/transport/mmio.rs b/src/vmm/src/devices/virtio/transport/mmio.rs index 224f086fdbb..07cb03fbdbb 100644 --- a/src/vmm/src/devices/virtio/transport/mmio.rs +++ b/src/vmm/src/devices/virtio/transport/mmio.rs @@ -193,7 +193,7 @@ impl MmioTransport { let _ = self .locked_device() .interrupt_trigger() - .trigger_irq(IrqType::Config); + .trigger(VirtioInterruptType::Config); error!("Failed to activate virtio device: {}", err) } @@ -434,7 +434,7 @@ impl IrqTrigger { } } - pub fn trigger_irq(&self, irq_type: IrqType) -> Result<(), std::io::Error> { + fn trigger_irq(&self, irq_type: IrqType) -> Result<(), std::io::Error> { let irq = match irq_type { IrqType::Config => VIRTIO_MMIO_INT_CONFIG, IrqType::Vring => VIRTIO_MMIO_INT_VRING, @@ -453,6 +453,8 @@ impl IrqTrigger { #[cfg(test)] pub(crate) mod tests { + use std::ops::Deref; + use vmm_sys_util::eventfd::EventFd; use super::*; @@ -467,7 +469,7 @@ pub(crate) mod tests { pub(crate) struct DummyDevice { acked_features: u64, avail_features: u64, - interrupt_trigger: Option>, + interrupt_trigger: Option>, queue_evts: Vec, queues: Vec, device_activated: bool, @@ -526,10 +528,11 @@ pub(crate) mod tests { &self.queue_evts } - fn interrupt_trigger(&self) -> &IrqTrigger { + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { self.interrupt_trigger .as_ref() .expect("Device is not activated") + .deref() } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -545,7 +548,7 @@ pub(crate) mod tests { fn activate( &mut self, _: GuestMemoryMmap, - interrupt: Arc, + interrupt: Arc, ) -> Result<(), ActivateError> { self.device_activated = true; self.interrupt_trigger = Some(interrupt); @@ -985,7 +988,8 @@ pub(crate) mod tests { assert_eq!( d.locked_device() .interrupt_trigger() - .irq_evt + .notifier(VirtioInterruptType::Config) + .unwrap() .read() .unwrap(), 1 @@ -1089,25 +1093,6 @@ pub(crate) mod tests { assert_eq!(dummy_dev.acked_features(), 24); } - impl IrqTrigger { - pub fn has_pending_irq(&self, irq_type: IrqType) -> bool { - if let Ok(num_irqs) = self.irq_evt.read() { - if num_irqs == 0 { - return false; - } - - let irq_status = self.irq_status.load(Ordering::SeqCst); - return matches!( - (irq_status, irq_type), - (VIRTIO_MMIO_INT_CONFIG, IrqType::Config) - | (VIRTIO_MMIO_INT_VRING, IrqType::Vring) - ); - } - - false - } - } - #[test] fn irq_trigger() { let irq_trigger = IrqTrigger::new(); diff --git a/src/vmm/src/devices/virtio/vhost_user.rs b/src/vmm/src/devices/virtio/vhost_user.rs index 4f895e5c05e..556a8adafaf 100644 --- a/src/vmm/src/devices/virtio/vhost_user.rs +++ b/src/vmm/src/devices/virtio/vhost_user.rs @@ -6,6 +6,7 @@ use std::os::fd::AsRawFd; use std::os::unix::net::UnixStream; +use std::sync::Arc; use vhost::vhost_user::message::*; use vhost::vhost_user::{Frontend, VhostUserFrontend}; @@ -14,7 +15,7 @@ use vm_memory::{Address, Error as MmapError, GuestMemory, GuestMemoryError, Gues use vmm_sys_util::eventfd::EventFd; use crate::devices::virtio::queue::Queue; -use crate::devices::virtio::transport::mmio::IrqTrigger; +use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::vstate::memory::GuestMemoryMmap; /// vhost-user error. @@ -400,7 +401,7 @@ impl VhostUserHandleImpl { &mut self, mem: &GuestMemoryMmap, queues: &[(usize, &Queue, &EventFd)], - irq_trigger: &IrqTrigger, + interrupt: Arc, ) -> Result<(), VhostUserError> { // Provide the memory table to the backend. 
self.update_mem_table(mem)?; @@ -442,7 +443,17 @@ impl VhostUserHandleImpl { // No matter the queue, we set irq_evt for signaling the guest that buffers were // consumed. self.vu - .set_vring_call(*queue_index, &irq_trigger.irq_evt) + .set_vring_call( + *queue_index, + interrupt + .notifier(VirtioInterruptType::Queue( + (*queue_index).try_into().unwrap_or_else(|_| { + panic!("vhost-user: invalid queue index: {}", *queue_index) + }), + )) + .as_ref() + .unwrap(), + ) .map_err(VhostUserError::VhostUserSetVringCall)?; self.vu @@ -467,6 +478,7 @@ pub(crate) mod tests { use vmm_sys_util::tempfile::TempFile; use super::*; + use crate::devices::virtio::test_utils::default_interrupt; use crate::test_utils::create_tmp_socket; use crate::vstate::memory; use crate::vstate::memory::GuestAddress; @@ -901,11 +913,11 @@ pub(crate) mod tests { queue.initialize(&guest_memory).unwrap(); let event_fd = EventFd::new(0).unwrap(); - let irq_trigger = IrqTrigger::new(); let queues = [(0, &queue, &event_fd)]; - vuh.setup_backend(&guest_memory, &queues, &irq_trigger) + let interrupt = default_interrupt(); + vuh.setup_backend(&guest_memory, &queues, interrupt.clone()) .unwrap(); // VhostUserHandleImpl should correctly send memory and queues information to @@ -929,7 +941,11 @@ pub(crate) mod tests { log_addr: None, }, base: queue.avail_ring_idx_get(), - call: irq_trigger.irq_evt.as_raw_fd(), + call: interrupt + .notifier(VirtioInterruptType::Queue(0u16)) + .as_ref() + .unwrap() + .as_raw_fd(), kick: event_fd.as_raw_fd(), enable: true, }; diff --git a/src/vmm/src/devices/virtio/vsock/device.rs b/src/vmm/src/devices/virtio/vsock/device.rs index ad049b517e4..61ca3246d43 100644 --- a/src/vmm/src/devices/virtio/vsock/device.rs +++ b/src/vmm/src/devices/virtio/vsock/device.rs @@ -21,6 +21,7 @@ //! - a backend FD. use std::fmt::Debug; +use std::ops::Deref; use std::sync::Arc; use log::{error, warn}; @@ -34,7 +35,7 @@ use crate::devices::virtio::ActivateError; use crate::devices::virtio::device::{ActiveState, DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_config::{VIRTIO_F_IN_ORDER, VIRTIO_F_VERSION_1}; use crate::devices::virtio::queue::{InvalidAvailIdx, Queue as VirtQueue}; -use crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; +use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::devices::virtio::vsock::VsockError; use crate::devices::virtio::vsock::metrics::METRICS; use crate::logger::IncMetric; @@ -136,12 +137,14 @@ where /// Signal the guest driver that we've used some virtio buffers that it had previously made /// available. 
- pub fn signal_used_queue(&self) -> Result<(), DeviceError> { + pub fn signal_used_queue(&self, qidx: usize) -> Result<(), DeviceError> { self.device_state .active_state() .expect("Device is not initialized") .interrupt - .trigger_irq(IrqType::Vring) + .trigger(VirtioInterruptType::Queue(qidx.try_into().unwrap_or_else( + |_| panic!("vsock: invalid queue index: {qidx}"), + ))) .map_err(DeviceError::FailedSignalingIrq) } @@ -259,7 +262,7 @@ where }); queue.advance_used_ring_idx(); - self.signal_used_queue()?; + self.signal_used_queue(EVQ_INDEX)?; Ok(()) } @@ -297,12 +300,12 @@ where &self.queue_events } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self - .device_state + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { + self.device_state .active_state() .expect("Device is not initialized") .interrupt + .deref() } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -337,7 +340,7 @@ where fn activate( &mut self, mem: GuestMemoryMmap, - interrupt: Arc, + interrupt: Arc, ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) diff --git a/src/vmm/src/devices/virtio/vsock/event_handler.rs b/src/vmm/src/devices/virtio/vsock/event_handler.rs index 9c909048a69..a54998ba808 100755 --- a/src/vmm/src/devices/virtio/vsock/event_handler.rs +++ b/src/vmm/src/devices/virtio/vsock/event_handler.rs @@ -197,9 +197,10 @@ where Self::PROCESS_EVQ => raise_irq = self.handle_evq_event(evset), Self::PROCESS_NOTIFY_BACKEND => raise_irq = self.notify_backend(evset).unwrap(), _ => warn!("Unexpected vsock event received: {:?}", source), - } + }; if raise_irq { - self.signal_used_queue().unwrap_or_default(); + self.signal_used_queue(source as usize) + .expect("vsock: Could not trigger device interrupt"); } } else { warn!( diff --git a/src/vmm/src/devices/virtio/vsock/persist.rs b/src/vmm/src/devices/virtio/vsock/persist.rs index 3d0967926be..9d2fd61d9d5 100644 --- a/src/vmm/src/devices/virtio/vsock/persist.rs +++ b/src/vmm/src/devices/virtio/vsock/persist.rs @@ -12,7 +12,7 @@ use super::*; use crate::devices::virtio::device::{ActiveState, DeviceState}; use crate::devices::virtio::persist::VirtioDeviceState; use crate::devices::virtio::queue::FIRECRACKER_MAX_QUEUE_SIZE; -use crate::devices::virtio::transport::mmio::IrqTrigger; +use crate::devices::virtio::transport::VirtioInterrupt; use crate::devices::virtio::vsock::TYPE_VSOCK; use crate::snapshot::Persist; use crate::vstate::memory::GuestMemoryMmap; @@ -54,7 +54,7 @@ pub struct VsockConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, /// Interrupt to use for the device. - pub interrupt: Arc, + pub interrupt: Arc, /// The vsock Unix Backend. 
pub backend: B, } diff --git a/src/vmm/src/devices/virtio/vsock/test_utils.rs b/src/vmm/src/devices/virtio/vsock/test_utils.rs index 56795e5fd36..b38ce070c66 100644 --- a/src/vmm/src/devices/virtio/vsock/test_utils.rs +++ b/src/vmm/src/devices/virtio/vsock/test_utils.rs @@ -14,7 +14,7 @@ use super::packet::{VsockPacketRx, VsockPacketTx}; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::{VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; use crate::devices::virtio::test_utils::{VirtQueue as GuestQ, default_interrupt}; -use crate::devices::virtio::transport::mmio::IrqTrigger; +use crate::devices::virtio::transport::VirtioInterrupt; use crate::devices::virtio::vsock::device::{RXQ_INDEX, TXQ_INDEX}; use crate::devices::virtio::vsock::packet::VSOCK_PKT_HDR_SIZE; use crate::devices::virtio::vsock::{ @@ -119,7 +119,7 @@ impl VsockBackend for TestBackend {} pub struct TestContext { pub cid: u64, pub mem: GuestMemoryMmap, - pub interrupt: Arc, + pub interrupt: Arc, pub mem_size: usize, pub device: Vsock, } @@ -200,7 +200,7 @@ pub struct EventHandlerContext<'a> { } impl EventHandlerContext<'_> { - pub fn mock_activate(&mut self, mem: GuestMemoryMmap, interrupt: Arc) { + pub fn mock_activate(&mut self, mem: GuestMemoryMmap, interrupt: Arc) { // Artificially activate the device. self.device.activate(mem, interrupt).unwrap(); } From e7ba499728bff6eba88a1a21a21493fe268279d2 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 29 Apr 2025 14:58:54 +0200 Subject: [PATCH 06/99] vm-device: add vm-device crate to repo Bring in the vm-device crate from CloudHypervisor. We will be using it for adding PCIe support. Signed-off-by: Babis Chalios --- Cargo.lock | 8 + src/vm-device/Cargo.toml | 16 ++ src/vm-device/src/bus.rs | 407 +++++++++++++++++++++++++++ src/vm-device/src/dma_mapping/mod.rs | 18 ++ src/vm-device/src/interrupt/mod.rs | 194 +++++++++++++ src/vm-device/src/lib.rs | 63 +++++ src/vmm/Cargo.toml | 10 +- 7 files changed, 714 insertions(+), 2 deletions(-) create mode 100644 src/vm-device/Cargo.toml create mode 100644 src/vm-device/src/bus.rs create mode 100644 src/vm-device/src/dma_mapping/mod.rs create mode 100644 src/vm-device/src/interrupt/mod.rs create mode 100644 src/vm-device/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 6e2ae545440..044ab7bc025 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1585,6 +1585,14 @@ dependencies = [ "thiserror 2.0.12", ] +[[package]] +name = "vm-device" +version = "0.1.0" +dependencies = [ + "serde", + "vmm-sys-util", +] + [[package]] name = "vm-fdt" version = "0.3.0" diff --git a/src/vm-device/Cargo.toml b/src/vm-device/Cargo.toml new file mode 100644 index 00000000000..b6471ab9f6a --- /dev/null +++ b/src/vm-device/Cargo.toml @@ -0,0 +1,16 @@ +[package] +authors = ["The Cloud Hypervisor Authors"] +edition = "2021" +name = "vm-device" +version = "0.1.0" +license = "Apache-2.0 AND BSD-3-Clause" + +[lib] +bench = false + +[features] +default = [] + +[dependencies] +serde = { version = "1.0.208", features = ["derive", "rc"] } +vmm-sys-util = { version = "0.14.0", features = ["with-serde"] } diff --git a/src/vm-device/src/bus.rs b/src/vm-device/src/bus.rs new file mode 100644 index 00000000000..31880d354bb --- /dev/null +++ b/src/vm-device/src/bus.rs @@ -0,0 +1,407 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. + +//! Handles routing to devices in an address space. + +use std::cmp::Ordering; +use std::collections::btree_map::BTreeMap; +use std::sync::{Arc, Barrier, Mutex, RwLock, Weak}; +use std::{convert, error, fmt, io, result}; + +/// Trait for devices that respond to reads or writes in an arbitrary address space. +/// +/// The device does not care where it exists in address space as each method is only given an offset +/// into its allocated portion of address space. +#[allow(unused_variables)] +pub trait BusDevice: Send { + /// Reads at `offset` from this device + fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {} + /// Writes at `offset` into this device + fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option> { + None + } +} + +#[allow(unused_variables)] +pub trait BusDeviceSync: Send + Sync { + /// Reads at `offset` from this device + fn read(&self, base: u64, offset: u64, data: &mut [u8]) {} + /// Writes at `offset` into this device + fn write(&self, base: u64, offset: u64, data: &[u8]) -> Option> { + None + } +} + +impl BusDeviceSync for Mutex { + /// Reads at `offset` from this device + fn read(&self, base: u64, offset: u64, data: &mut [u8]) { + self.lock() + .expect("Failed to acquire device lock") + .read(base, offset, data) + } + /// Writes at `offset` into this device + fn write(&self, base: u64, offset: u64, data: &[u8]) -> Option> { + self.lock() + .expect("Failed to acquire device lock") + .write(base, offset, data) + } +} + +#[derive(Debug)] +pub enum Error { + /// The insertion failed because the new device overlapped with an old device. + Overlap, + /// Failed to operate on zero sized range. + ZeroSizedRange, + /// Failed to find address range. + MissingAddressRange, +} + +pub type Result = result::Result; + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "bus_error: {self:?}") + } +} + +impl error::Error for Error {} + +impl convert::From for io::Error { + fn from(e: Error) -> Self { + io::Error::other(e) + } +} + +/// Holds a base and length representing the address space occupied by a `BusDevice`. +/// +/// * base - The address at which the range start. +/// * len - The length of the range in bytes. +#[derive(Debug, Copy, Clone)] +pub struct BusRange { + pub base: u64, + pub len: u64, +} + +impl BusRange { + /// Returns true if there is overlap with the given range. + pub fn overlaps(&self, base: u64, len: u64) -> bool { + self.base < (base + len) && base < self.base + self.len + } +} + +impl Eq for BusRange {} + +impl PartialEq for BusRange { + fn eq(&self, other: &BusRange) -> bool { + self.base == other.base + } +} + +impl Ord for BusRange { + fn cmp(&self, other: &BusRange) -> Ordering { + self.base.cmp(&other.base) + } +} + +impl PartialOrd for BusRange { + fn partial_cmp(&self, other: &BusRange) -> Option { + Some(self.cmp(other)) + } +} + +/// A device container for routing reads and writes over some address space. +/// +/// This doesn't have any restrictions on what kind of device or address space this applies to. The +/// only restriction is that no two devices can overlap in this address space. +#[derive(Default, Debug)] +pub struct Bus { + devices: RwLock>>, +} + +impl Bus { + /// Constructs an a bus with an empty address space. 
+ pub fn new() -> Bus { + Bus { + devices: RwLock::new(BTreeMap::new()), + } + } + + fn first_before(&self, addr: u64) -> Option<(BusRange, Arc)> { + let devices = self.devices.read().unwrap(); + let (range, dev) = devices + .range(..=BusRange { base: addr, len: 1 }) + .next_back()?; + dev.upgrade().map(|d| (*range, d.clone())) + } + + #[allow(clippy::type_complexity)] + pub fn resolve(&self, addr: u64) -> Option<(u64, u64, Arc)> { + if let Some((range, dev)) = self.first_before(addr) { + let offset = addr - range.base; + if offset < range.len { + return Some((range.base, offset, dev)); + } + } + None + } + + pub fn insert(&self, device: Arc, base: u64, len: u64) -> Result<()> { + if len == 0 { + return Err(Error::ZeroSizedRange); + } + + // Reject all cases where the new device's range overlaps with an existing device. + if self + .devices + .read() + .unwrap() + .iter() + .any(|(range, _dev)| range.overlaps(base, len)) + { + return Err(Error::Overlap); + } + + if self + .devices + .write() + .unwrap() + .insert(BusRange { base, len }, Arc::downgrade(&device)) + .is_some() + { + return Err(Error::Overlap); + } + + Ok(()) + } + + /// Removes the device at the given address space range. + pub fn remove(&self, base: u64, len: u64) -> Result<()> { + if len == 0 { + return Err(Error::ZeroSizedRange); + } + + let bus_range = BusRange { base, len }; + + if self.devices.write().unwrap().remove(&bus_range).is_none() { + return Err(Error::MissingAddressRange); + } + + Ok(()) + } + + /// Removes all entries referencing the given device. + pub fn remove_by_device(&self, device: &Arc) -> Result<()> { + let mut device_list = self.devices.write().unwrap(); + let mut remove_key_list = Vec::new(); + + for (key, value) in device_list.iter() { + if Arc::ptr_eq(&value.upgrade().unwrap(), device) { + remove_key_list.push(*key); + } + } + + for key in remove_key_list.iter() { + device_list.remove(key); + } + + Ok(()) + } + + /// Updates the address range for an existing device. + pub fn update_range( + &self, + old_base: u64, + old_len: u64, + new_base: u64, + new_len: u64, + ) -> Result<()> { + // Retrieve the device corresponding to the range + let device = if let Some((_, _, dev)) = self.resolve(old_base) { + dev.clone() + } else { + return Err(Error::MissingAddressRange); + }; + + // Remove the old address range + self.remove(old_base, old_len)?; + + // Insert the new address range + self.insert(device, new_base, new_len) + } + + /// Reads data from the device that owns the range containing `addr` and puts it into `data`. + /// + /// Returns true on success, otherwise `data` is untouched. + pub fn read(&self, addr: u64, data: &mut [u8]) -> Result<()> { + if let Some((base, offset, dev)) = self.resolve(addr) { + // OK to unwrap as lock() failing is a serious error condition and should panic. + dev.read(base, offset, data); + Ok(()) + } else { + Err(Error::MissingAddressRange) + } + } + + /// Writes `data` to the device that owns the range containing `addr`. + /// + /// Returns true on success, otherwise `data` is untouched. + pub fn write(&self, addr: u64, data: &[u8]) -> Result>> { + if let Some((base, offset, dev)) = self.resolve(addr) { + // OK to unwrap as lock() failing is a serious error condition and should panic. 
+ Ok(dev.write(base, offset, data)) + } else { + Err(Error::MissingAddressRange) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + struct DummyDevice; + impl BusDeviceSync for DummyDevice {} + + struct ConstantDevice; + impl BusDeviceSync for ConstantDevice { + fn read(&self, _base: u64, offset: u64, data: &mut [u8]) { + for (i, v) in data.iter_mut().enumerate() { + *v = (offset as u8) + (i as u8); + } + } + + fn write(&self, _base: u64, offset: u64, data: &[u8]) -> Option> { + for (i, v) in data.iter().enumerate() { + assert_eq!(*v, (offset as u8) + (i as u8)) + } + + None + } + } + + #[test] + fn bus_insert() { + let bus = Bus::new(); + let dummy = Arc::new(DummyDevice); + bus.insert(dummy.clone(), 0x10, 0).unwrap_err(); + bus.insert(dummy.clone(), 0x10, 0x10).unwrap(); + + let result = bus.insert(dummy.clone(), 0x0f, 0x10); + assert_eq!(format!("{result:?}"), "Err(Overlap)"); + + bus.insert(dummy.clone(), 0x10, 0x10).unwrap_err(); + bus.insert(dummy.clone(), 0x10, 0x15).unwrap_err(); + bus.insert(dummy.clone(), 0x12, 0x15).unwrap_err(); + bus.insert(dummy.clone(), 0x12, 0x01).unwrap_err(); + bus.insert(dummy.clone(), 0x0, 0x20).unwrap_err(); + bus.insert(dummy.clone(), 0x20, 0x05).unwrap(); + bus.insert(dummy.clone(), 0x25, 0x05).unwrap(); + bus.insert(dummy, 0x0, 0x10).unwrap(); + } + + #[test] + fn bus_remove() { + let bus = Bus::new(); + let dummy: Arc = Arc::new(DummyDevice); + + bus.remove(0x42, 0x0).unwrap_err(); + + bus.remove(0x13, 0x12).unwrap_err(); + + bus.insert(dummy.clone(), 0x13, 0x12).unwrap(); + bus.remove(0x42, 0x42).unwrap_err(); + bus.remove(0x13, 0x12).unwrap(); + + bus.insert(dummy.clone(), 0x16, 0x1).unwrap(); + bus.remove_by_device(&dummy).unwrap(); + bus.remove(0x16, 0x1).unwrap_err(); + } + + #[test] + #[allow(clippy::redundant_clone)] + fn bus_read_write() { + let bus = Bus::new(); + let dummy = Arc::new(DummyDevice); + bus.insert(dummy.clone(), 0x10, 0x10).unwrap(); + bus.read(0x10, &mut [0, 0, 0, 0]).unwrap(); + bus.write(0x10, &[0, 0, 0, 0]).unwrap(); + bus.read(0x11, &mut [0, 0, 0, 0]).unwrap(); + bus.write(0x11, &[0, 0, 0, 0]).unwrap(); + bus.read(0x16, &mut [0, 0, 0, 0]).unwrap(); + bus.write(0x16, &[0, 0, 0, 0]).unwrap(); + bus.read(0x20, &mut [0, 0, 0, 0]).unwrap_err(); + bus.write(0x20, &[0, 0, 0, 0]).unwrap_err(); + bus.read(0x06, &mut [0, 0, 0, 0]).unwrap_err(); + bus.write(0x06, &[0, 0, 0, 0]).unwrap_err(); + } + + #[test] + #[allow(clippy::redundant_clone)] + fn bus_read_write_values() { + let bus = Bus::new(); + let dummy = Arc::new(ConstantDevice); + bus.insert(dummy.clone(), 0x10, 0x10).unwrap(); + + let mut values = [0, 1, 2, 3]; + bus.read(0x10, &mut values).unwrap(); + assert_eq!(values, [0, 1, 2, 3]); + bus.write(0x10, &values).unwrap(); + bus.read(0x15, &mut values).unwrap(); + assert_eq!(values, [5, 6, 7, 8]); + bus.write(0x15, &values).unwrap(); + } + + #[test] + #[allow(clippy::redundant_clone)] + fn busrange_cmp() { + let range = BusRange { base: 0x10, len: 2 }; + assert_eq!(range, BusRange { base: 0x10, len: 3 }); + assert_eq!(range, BusRange { base: 0x10, len: 2 }); + + assert!(range < BusRange { base: 0x12, len: 1 }); + assert!(range < BusRange { base: 0x12, len: 3 }); + + assert_eq!(range, range.clone()); + + let bus = Bus::new(); + let mut data = [1, 2, 3, 4]; + let device = Arc::new(DummyDevice); + bus.insert(device.clone(), 0x10, 0x10).unwrap(); + bus.write(0x10, &data).unwrap(); + bus.read(0x10, &mut data).unwrap(); + assert_eq!(data, [1, 2, 3, 4]); + } + + #[test] + fn bus_range_overlap() { + let a = BusRange { + 
base: 0x1000, + len: 0x400, + }; + assert!(a.overlaps(0x1000, 0x400)); + assert!(a.overlaps(0xf00, 0x400)); + assert!(a.overlaps(0x1000, 0x01)); + assert!(a.overlaps(0xfff, 0x02)); + assert!(a.overlaps(0x1100, 0x100)); + assert!(a.overlaps(0x13ff, 0x100)); + assert!(!a.overlaps(0x1400, 0x100)); + assert!(!a.overlaps(0xf00, 0x100)); + } + + #[test] + fn bus_update_range() { + let bus = Bus::new(); + let dummy = Arc::new(DummyDevice); + + bus.update_range(0x13, 0x12, 0x16, 0x1).unwrap_err(); + bus.insert(dummy.clone(), 0x13, 12).unwrap(); + + bus.update_range(0x16, 0x1, 0x13, 0x12).unwrap_err(); + bus.update_range(0x13, 0x12, 0x16, 0x1).unwrap(); + } +} diff --git a/src/vm-device/src/dma_mapping/mod.rs b/src/vm-device/src/dma_mapping/mod.rs new file mode 100644 index 00000000000..6cba6e16488 --- /dev/null +++ b/src/vm-device/src/dma_mapping/mod.rs @@ -0,0 +1,18 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright © 2021 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause + +/// Trait to trigger DMA mapping updates for devices managed by virtio-iommu +/// +/// Trait meant for triggering the DMA mapping update related to an external +/// device not managed fully through virtio. It is dedicated to virtio-iommu +/// in order to trigger the map update anytime the mapping is updated from the +/// guest. +pub trait ExternalDmaMapping: Send + Sync { + /// Map a memory range + fn map(&self, iova: u64, gpa: u64, size: u64) -> std::result::Result<(), std::io::Error>; + + /// Unmap a memory range + fn unmap(&self, iova: u64, size: u64) -> std::result::Result<(), std::io::Error>; +} diff --git a/src/vm-device/src/interrupt/mod.rs b/src/vm-device/src/interrupt/mod.rs new file mode 100644 index 00000000000..f4aec52a2e0 --- /dev/null +++ b/src/vm-device/src/interrupt/mod.rs @@ -0,0 +1,194 @@ +// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright (C) 2019 Alibaba Cloud. All rights reserved. +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause + +//! Traits and Structs to manage interrupt sources for devices. +//! +//! In system programming, an interrupt is a signal to the processor emitted by hardware or +//! software indicating an event that needs immediate attention. An interrupt alerts the processor +//! to a high-priority condition requiring the interruption of the current code the processor is +//! executing. The processor responds by suspending its current activities, saving its state, and +//! executing a function called an interrupt handler (or an interrupt service routine, ISR) to deal +//! with the event. This interruption is temporary, and, after the interrupt handler finishes, +//! unless handling the interrupt has emitted a fatal error, the processor resumes normal +//! activities. +//! +//! Hardware interrupts are used by devices to communicate that they require attention from the +//! operating system, or a bare-metal program running on the CPU if there are no OSes. The act of +//! initiating a hardware interrupt is referred to as an interrupt request (IRQ). Different devices +//! are usually associated with different interrupts using a unique value associated with each +//! interrupt. This makes it possible to know which hardware device caused which interrupts. +//! These interrupt values are often called IRQ lines, or just interrupt lines. +//! +//! Nowadays, IRQ lines is not the only mechanism to deliver device interrupts to processors. +//! 
MSI [(Message Signaled Interrupt)](https://en.wikipedia.org/wiki/Message_Signaled_Interrupts) +//! is another commonly used alternative in-band method of signaling an interrupt, using special +//! in-band messages to replace traditional out-of-band assertion of dedicated interrupt lines. +//! While more complex to implement in a device, message signaled interrupts have some significant +//! advantages over pin-based out-of-band interrupt signaling. Message signaled interrupts are +//! supported in PCI bus since its version 2.2, and in later available PCI Express bus. Some +//! non-PCI architectures also use message signaled interrupts. +//! +//! While IRQ is a term commonly used by Operating Systems when dealing with hardware +//! interrupts, the IRQ numbers managed by OSes are independent of the ones managed by VMM. +//! For simplicity sake, the term `Interrupt Source` is used instead of IRQ to represent both +//! pin-based interrupts and MSI interrupts. +//! +//! A device may support multiple types of interrupts, and each type of interrupt may support one +//! or multiple interrupt sources. For example, a PCI device may support: +//! * Legacy Irq: exactly one interrupt source. +//! * PCI MSI Irq: 1,2,4,8,16,32 interrupt sources. +//! * PCI MSIx Irq: 2^n(n=0-11) interrupt sources. +//! +//! A distinct Interrupt Source Identifier (ISID) will be assigned to each interrupt source. +//! An ID allocator will be used to allocate and free Interrupt Source Identifiers for devices. +//! To decouple the vm-device crate from the ID allocator, the vm-device crate doesn't take the +//! responsibility to allocate/free Interrupt Source IDs but only makes use of assigned IDs. +//! +//! The overall flow to deal with interrupts is: +//! * The VMM creates an interrupt manager +//! * The VMM creates a device manager, passing on an reference to the interrupt manager +//! * The device manager passes on an reference to the interrupt manager to all registered devices +//! * The guest kernel loads drivers for virtual devices +//! * The guest device driver determines the type and number of interrupts needed, and update the +//! device configuration +//! * The virtual device backend requests the interrupt manager to create an interrupt group +//! according to guest configuration information + +use std::sync::Arc; + +use vmm_sys_util::eventfd::EventFd; + +/// Reuse std::io::Result to simplify interoperability among crates. +pub type Result = std::io::Result; + +/// Data type to store an interrupt source identifier. +pub type InterruptIndex = u32; + +/// Configuration data for legacy interrupts. +/// +/// On x86 platforms, legacy interrupts means those interrupts routed through PICs or IOAPICs. +#[derive(Copy, Clone, Debug)] +pub struct LegacyIrqSourceConfig { + pub irqchip: u32, + pub pin: u32, +} + +/// Configuration data for MSI/MSI-X interrupts. +/// +/// On x86 platforms, these interrupts are vectors delivered directly to the LAPIC. +#[derive(Copy, Clone, Debug, Default)] +pub struct MsiIrqSourceConfig { + /// High address to delivery message signaled interrupt. + pub high_addr: u32, + /// Low address to delivery message signaled interrupt. + pub low_addr: u32, + /// Data to write to delivery message signaled interrupt. + pub data: u32, + /// Unique ID of the device to delivery message signaled interrupt. + pub devid: u32, +} + +/// Configuration data for an interrupt source. +#[derive(Copy, Clone, Debug)] +pub enum InterruptSourceConfig { + /// Configuration data for Legacy interrupts. 
+ LegacyIrq(LegacyIrqSourceConfig), + /// Configuration data for PciMsi, PciMsix and generic MSI interrupts. + MsiIrq(MsiIrqSourceConfig), +} + +/// Configuration data for legacy, pin based interrupt groups. +/// +/// A legacy interrupt group only takes one irq number as its configuration. +#[derive(Copy, Clone, Debug)] +pub struct LegacyIrqGroupConfig { + /// Legacy irq number. + pub irq: InterruptIndex, +} + +/// Configuration data for MSI/MSI-X interrupt groups +/// +/// MSI/MSI-X interrupt groups are basically a set of vectors. +#[derive(Copy, Clone, Debug)] +pub struct MsiIrqGroupConfig { + /// First index of the MSI/MSI-X interrupt vectors + pub base: InterruptIndex, + /// Number of vectors in the MSI/MSI-X group. + pub count: InterruptIndex, +} + +/// Trait to manage interrupt sources for virtual device backends. +/// +/// The InterruptManager implementations should protect itself from concurrent accesses internally, +/// so it could be invoked from multi-threaded context. +pub trait InterruptManager: Send + Sync { + type GroupConfig; + + /// Create an [InterruptSourceGroup](trait.InterruptSourceGroup.html) object to manage + /// interrupt sources for a virtual device + /// + /// An [InterruptSourceGroup](trait.InterruptSourceGroup.html) object manages all interrupt + /// sources of the same type for a virtual device. + /// + /// # Arguments + /// * interrupt_type: type of interrupt source. + /// * base: base Interrupt Source ID to be managed by the group object. + /// * count: number of Interrupt Sources to be managed by the group object. + fn create_group(&self, config: Self::GroupConfig) -> Result>; + + /// Destroy an [InterruptSourceGroup](trait.InterruptSourceGroup.html) object created by + /// [create_group()](trait.InterruptManager.html#tymethod.create_group). + /// + /// Assume the caller takes the responsibility to disable all interrupt sources of the group + /// before calling destroy_group(). This assumption helps to simplify InterruptSourceGroup + /// implementations. + fn destroy_group(&self, group: Arc) -> Result<()>; +} + +pub trait InterruptSourceGroup: Send + Sync { + /// Enable the interrupt sources in the group to generate interrupts. + fn enable(&self) -> Result<()> { + // Not all interrupt sources can be enabled. + // To accommodate this, we can have a no-op here. + Ok(()) + } + + /// Disable the interrupt sources in the group to generate interrupts. + fn disable(&self) -> Result<()> { + // Not all interrupt sources can be disabled. + // To accommodate this, we can have a no-op here. + Ok(()) + } + + /// Inject an interrupt from this interrupt source into the guest. + fn trigger(&self, index: InterruptIndex) -> Result<()>; + + /// Returns an interrupt notifier from this interrupt. + /// + /// An interrupt notifier allows for external components and processes + /// to inject interrupts into a guest, by writing to the file returned + /// by this method. + #[allow(unused_variables)] + fn notifier(&self, index: InterruptIndex) -> Option; + + /// Update the interrupt source group configuration. + /// + /// # Arguments + /// * index: sub-index into the group. + /// * config: configuration data for the interrupt source. + /// * masked: if the interrupt is masked + /// * set_gsi: whether update the GSI routing table. + fn update( + &self, + index: InterruptIndex, + config: InterruptSourceConfig, + masked: bool, + set_gsi: bool, + ) -> Result<()>; + + /// Set the interrupt group GSI routing table. 
+ fn set_gsi(&self) -> Result<()>; +} diff --git a/src/vm-device/src/lib.rs b/src/vm-device/src/lib.rs new file mode 100644 index 00000000000..fe06fd8b465 --- /dev/null +++ b/src/vm-device/src/lib.rs @@ -0,0 +1,63 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +use serde::{Deserialize, Serialize}; + +mod bus; +pub mod dma_mapping; +pub mod interrupt; + +pub use self::bus::{Bus, BusDevice, BusDeviceSync, Error as BusError}; + +/// Type of Message Signalled Interrupt +#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub enum MsiIrqType { + /// PCI MSI IRQ numbers. + PciMsi, + /// PCI MSIx IRQ numbers. + PciMsix, + /// Generic MSI IRQ numbers. + GenericMsi, +} + +#[derive(Copy, Clone, PartialEq, Eq, Serialize, Deserialize, Debug)] +pub enum PciBarType { + Io, + Mmio32, + Mmio64, +} + +/// Enumeration for device resources. +#[allow(missing_docs)] +#[derive(Clone, Debug, Serialize, Deserialize)] +pub enum Resource { + /// IO Port address range. + PioAddressRange { base: u16, size: u16 }, + /// Memory Mapped IO address range. + MmioAddressRange { base: u64, size: u64 }, + /// PCI BAR + PciBar { + index: usize, + base: u64, + size: u64, + type_: PciBarType, + prefetchable: bool, + }, + /// Legacy IRQ number. + LegacyIrq(u32), + /// Message Signaled Interrupt + MsiIrq { + ty: MsiIrqType, + base: u32, + size: u32, + }, + /// Network Interface Card MAC address. + MacAddress(String), + /// KVM memslot index. + KvmMemSlot(u32), +} diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index 98d95a615c2..2bf62073443 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -22,7 +22,10 @@ base64 = "0.22.1" bincode = { version = "2.0.1", features = ["serde"] } bitflags = "2.9.1" crc64 = "2.0.0" -derive_more = { version = "2.0.1", default-features = false, features = ["from", "display"] } +derive_more = { version = "2.0.1", default-features = false, features = [ + "from", + "display", +] } displaydoc = "0.2.5" event-manager = "0.4.1" gdbstub = { version = "0.7.6", optional = true } @@ -45,7 +48,10 @@ userfaultfd = "0.8.1" utils = { path = "../utils" } vhost = { version = "0.14.0", features = ["vhost-user-frontend"] } vm-allocator = "0.1.3" -vm-memory = { version = "0.16.2", features = ["backend-mmap", "backend-bitmap"] } +vm-memory = { version = "0.16.2", features = [ + "backend-mmap", + "backend-bitmap", +] } vm-superio = "0.8.0" vmm-sys-util = { version = "0.14.0", features = ["with-serde"] } zerocopy = { version = "0.8.26" } From b6a8bfc4ce95f46abb0bad8f1e40b843e5e98179 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Thu, 24 Apr 2025 16:17:13 +0200 Subject: [PATCH 07/99] refactor(serial): remove generics from SerialDevice We use `SerialDevice` with Stdin as the input source. Encode this in the type so that we don't spill the generic all over the place. 
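In other words, the input end of the wrapper is fixed to Stdin once, behind the type alias, instead of being threaded as a generic parameter through every consumer. A simplified sketch of the shape this takes (the real wrapper also carries the vm-superio Serial, its trigger and events plumbing, and the output sink):

    use std::io::{Read, Stdin};

    /// Generic wrapper: the input source is a type parameter.
    pub struct SerialWrapper<I: Read> {
        pub input: Option<I>,
        // ... vm-superio Serial, interrupt/event fds, output sink, etc.
    }

    /// Public alias used everywhere else in the VMM: input is always Stdin,
    /// so callers no longer need to spell out the generic parameter.
    pub type SerialDevice = SerialWrapper<Stdin>;

Callers then build the device with SerialDevice::new(Some(std::io::stdin()), SerialOut::Stdout(std::io::stdout())), as the builder changes below do.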
Signed-off-by: Babis Chalios Co-authored-by: Egor Lazarchuk Signed-off-by: Egor Lazarchuk Signed-off-by: Babis Chalios --- src/vmm/src/builder.rs | 45 +++++++++------------------ src/vmm/src/device_manager/persist.rs | 7 ++--- src/vmm/src/devices/bus.rs | 6 ++-- src/vmm/src/devices/legacy/serial.rs | 27 ++++++++++++++-- 4 files changed, 43 insertions(+), 42 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 88e11ba25f9..963e47b2d1f 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -10,7 +10,6 @@ use std::sync::mpsc; use std::sync::{Arc, Mutex}; use event_manager::{MutEventSubscriber, SubscriberOps}; -use libc::EFD_NONBLOCK; use linux_loader::cmdline::Cmdline as LoaderKernelCmdline; use userfaultfd::Uffd; use utils::time::TimestampUs; @@ -18,8 +17,6 @@ use utils::time::TimestampUs; use vm_memory::GuestAddress; #[cfg(target_arch = "aarch64")] use vm_superio::Rtc; -use vm_superio::Serial; -use vmm_sys_util::eventfd::EventFd; use crate::arch::{ConfigurationError, configure_system_for_boot, load_kernel}; #[cfg(target_arch = "aarch64")] @@ -39,8 +36,8 @@ use crate::devices::BusDevice; use crate::devices::acpi::vmgenid::{VmGenId, VmGenIdError}; #[cfg(target_arch = "aarch64")] use crate::devices::legacy::RTCDevice; +use crate::devices::legacy::SerialDevice; use crate::devices::legacy::serial::SerialOut; -use crate::devices::legacy::{EventFdTrigger, SerialEventsWrapper, SerialWrapper}; use crate::devices::virtio::balloon::Balloon; use crate::devices::virtio::block::device::Block; use crate::devices::virtio::device::VirtioDevice; @@ -163,7 +160,7 @@ fn create_vmm_and_vcpus( set_stdout_nonblocking(); // Serial device setup. - let serial_device = setup_serial_device(event_manager, std::io::stdin(), io::stdout())?; + let serial_device = setup_serial_device(event_manager)?; // x86_64 uses the i8042 reset event as the Vmm exit event. let reset_evt = vcpus_exit_evt.try_clone().map_err(VmmError::EventFd)?; @@ -554,22 +551,11 @@ pub fn build_microvm_from_snapshot( /// Sets up the serial device. pub fn setup_serial_device( event_manager: &mut EventManager, - input: std::io::Stdin, - out: std::io::Stdout, ) -> Result>, VmmError> { - let interrupt_evt = EventFdTrigger::new(EventFd::new(EFD_NONBLOCK).map_err(VmmError::EventFd)?); - let kick_stdin_read_evt = - EventFdTrigger::new(EventFd::new(EFD_NONBLOCK).map_err(VmmError::EventFd)?); - let serial = Arc::new(Mutex::new(BusDevice::Serial(SerialWrapper { - serial: Serial::with_events( - interrupt_evt, - SerialEventsWrapper { - buffer_ready_event_fd: Some(kick_stdin_read_evt), - }, - SerialOut::Stdout(out), - ), - input: Some(input), - }))); + let serial = Arc::new(Mutex::new(BusDevice::Serial( + SerialDevice::new(Some(std::io::stdin()), SerialOut::Stdout(std::io::stdout())) + .map_err(VmmError::EventFd)?, + ))); event_manager.add_subscriber(serial.clone()); Ok(serial) } @@ -629,7 +615,7 @@ fn attach_legacy_devices_aarch64( if cmdline_contains_console { // Make stdout non-blocking. 
set_stdout_nonblocking(); - let serial = setup_serial_device(event_manager, std::io::stdin(), std::io::stdout())?; + let serial = setup_serial_device(event_manager)?; vmm.mmio_device_manager .register_mmio_serial(vmm.vm.fd(), &mut vmm.resource_allocator, serial, None) .map_err(VmmError::RegisterMMIODevice)?; @@ -809,11 +795,15 @@ pub(crate) fn set_stdout_nonblocking() { pub(crate) mod tests { use linux_loader::cmdline::Cmdline; + #[cfg(target_arch = "x86_64")] + use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::tempfile::TempFile; use super::*; use crate::arch::DeviceType; use crate::device_manager::resources::ResourceAllocator; + #[cfg(target_arch = "x86_64")] + use crate::devices::legacy::serial::SerialOut; use crate::devices::virtio::block::CacheType; use crate::devices::virtio::rng::device::ENTROPY_DEV_ID; use crate::devices::virtio::vsock::{TYPE_VSOCK, VSOCK_DEV_ID}; @@ -890,16 +880,9 @@ pub(crate) mod tests { let acpi_device_manager = ACPIDeviceManager::new(); #[cfg(target_arch = "x86_64")] let pio_device_manager = PortIODeviceManager::new( - Arc::new(Mutex::new(BusDevice::Serial(SerialWrapper { - serial: Serial::with_events( - EventFdTrigger::new(EventFd::new(EFD_NONBLOCK).unwrap()), - SerialEventsWrapper { - buffer_ready_event_fd: None, - }, - SerialOut::Sink(std::io::sink()), - ), - input: None, - }))), + Arc::new(Mutex::new(BusDevice::Serial( + SerialDevice::new(None, SerialOut::Sink(std::io::sink())).unwrap(), + ))), EventFd::new(libc::EFD_NONBLOCK).unwrap(), ) .unwrap(); diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 58c7134aa7f..5e5e53be7cb 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -403,11 +403,8 @@ impl<'a> Persist<'a> for MMIODeviceManager { { for state in &state.legacy_devices { if state.type_ == DeviceType::Serial { - let serial = crate::builder::setup_serial_device( - constructor_args.event_manager, - std::io::stdin(), - std::io::stdout(), - )?; + let serial = + crate::builder::setup_serial_device(constructor_args.event_manager)?; constructor_args .resource_allocator diff --git a/src/vmm/src/devices/bus.rs b/src/vmm/src/devices/bus.rs index d0e1b296998..516b40cc93f 100644 --- a/src/vmm/src/devices/bus.rs +++ b/src/vmm/src/devices/bus.rs @@ -65,7 +65,7 @@ pub enum BusDevice { RTCDevice(RTCDevice), BootTimer(BootTimer), MmioTransport(MmioTransport), - Serial(SerialDevice), + Serial(SerialDevice), #[cfg(test)] Dummy(DummyDevice), #[cfg(test)] @@ -127,7 +127,7 @@ impl BusDevice { _ => None, } } - pub fn serial_ref(&self) -> Option<&SerialDevice> { + pub fn serial_ref(&self) -> Option<&SerialDevice> { match self { Self::Serial(x) => Some(x), _ => None, @@ -159,7 +159,7 @@ impl BusDevice { _ => None, } } - pub fn serial_mut(&mut self) -> Option<&mut SerialDevice> { + pub fn serial_mut(&mut self) -> Option<&mut SerialDevice> { match self { Self::Serial(x) => Some(x), _ => None, diff --git a/src/vmm/src/devices/legacy/serial.rs b/src/vmm/src/devices/legacy/serial.rs index 278c15a4464..c73534e76c4 100644 --- a/src/vmm/src/devices/legacy/serial.rs +++ b/src/vmm/src/devices/legacy/serial.rs @@ -7,16 +7,17 @@ //! Implements a wrapper over an UART serial device. 
use std::fmt::Debug; -use std::io; -use std::io::{Read, Write}; +use std::io::{self, Read, Stdin, Write}; use std::os::unix::io::{AsRawFd, RawFd}; use event_manager::{EventOps, Events, MutEventSubscriber}; +use libc::EFD_NONBLOCK; use log::{error, warn}; use serde::Serialize; use vm_superio::serial::{Error as SerialError, SerialEvents}; use vm_superio::{Serial, Trigger}; use vmm_sys_util::epoll::EventSet; +use vmm_sys_util::eventfd::EventFd; use crate::devices::legacy::EventFdTrigger; use crate::logger::{IncMetric, SharedIncMetric}; @@ -220,7 +221,27 @@ impl SerialWrapper = SerialWrapper; +pub type SerialDevice = SerialWrapper; + +impl SerialDevice { + pub fn new(serial_in: Option, serial_out: SerialOut) -> Result { + let interrupt_evt = EventFdTrigger::new(EventFd::new(EFD_NONBLOCK)?); + let buffer_read_event_fd = EventFdTrigger::new(EventFd::new(EFD_NONBLOCK)?); + + let serial = Serial::with_events( + interrupt_evt, + SerialEventsWrapper { + buffer_ready_event_fd: Some(buffer_read_event_fd), + }, + serial_out, + ); + + Ok(SerialDevice { + serial, + input: serial_in, + }) + } +} impl MutEventSubscriber for SerialWrapper From e226c9405469e69b1f8ae631e205caf832795d32 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Thu, 24 Apr 2025 15:24:47 +0200 Subject: [PATCH 08/99] refactor: use vm_device::Bus as the MMIO bus Use the vm_device::Bus bus for all MMIO devices. This is mainly to prepare for using it for PCIe devices. Also, sepate VirtIO devices from other MMIO devices inside the MMIODeviceManager struct. This makes iterating over VirtIO devices easier since we don't need to access two data structures to get a reference to a VirtIO device any more. Signed-off-by: Babis Chalios --- Cargo.lock | 1 + src/vmm/Cargo.toml | 1 + src/vmm/src/arch/aarch64/fdt.rs | 96 ++--- src/vmm/src/arch/aarch64/mod.rs | 4 +- src/vmm/src/arch/aarch64/vcpu.rs | 3 +- src/vmm/src/arch/x86_64/vcpu.rs | 3 +- src/vmm/src/builder.rs | 37 +- src/vmm/src/device_manager/mmio.rs | 415 ++++++++++--------- src/vmm/src/device_manager/persist.rs | 90 ++-- src/vmm/src/devices/bus.rs | 68 +-- src/vmm/src/devices/legacy/rtc_pl031.rs | 19 +- src/vmm/src/devices/legacy/serial.rs | 19 + src/vmm/src/devices/pseudo/boot_timer.rs | 15 +- src/vmm/src/devices/virtio/transport/mmio.rs | 128 +++--- src/vmm/src/lib.rs | 74 +--- src/vmm/src/persist.rs | 7 +- src/vmm/src/vstate/vcpu.rs | 17 +- 17 files changed, 495 insertions(+), 502 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 044ab7bc025..aff0432be7f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1655,6 +1655,7 @@ dependencies = [ "utils", "vhost", "vm-allocator", + "vm-device", "vm-fdt", "vm-memory", "vm-superio", diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index 2bf62073443..e6c600378c1 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -48,6 +48,7 @@ userfaultfd = "0.8.1" utils = { path = "../utils" } vhost = { version = "0.14.0", features = ["vhost-user-frontend"] } vm-allocator = "0.1.3" +vm-device = { path = "../vm-device" } vm-memory = { version = "0.16.2", features = [ "backend-mmap", "backend-bitmap", diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index 7d7f7d748a9..359f47c7044 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -5,14 +5,12 @@ // Use of this source code is governed by a BSD-style license that can be // found in the THIRD-PARTY file. 
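All MMIO devices now hang off the vm-device bus imported two patches earlier. A minimal sketch of how a device manager registers a device on that bus and dispatches a guest access to it (the device type and addresses here are illustrative, mirroring the bus module's own tests):

    use std::sync::Arc;

    use vm_device::{Bus, BusDeviceSync};

    // Any MMIO device only needs to implement BusDeviceSync; offsets passed to
    // read()/write() are relative to the base it was registered at.
    struct DummyDevice;
    impl BusDeviceSync for DummyDevice {}

    fn main() -> Result<(), vm_device::BusError> {
        let mmio_bus = Bus::new();
        let dev = Arc::new(DummyDevice);

        // Claim a 0x1000-byte window at an arbitrary base for the device. The bus
        // only keeps a Weak reference, so ownership of `dev` stays with the caller.
        mmio_bus.insert(dev.clone(), 0xd000_0000, 0x1000)?;

        // A guest access inside the window is resolved to (base, offset) and
        // routed to the owning device.
        let mut data = [0u8; 4];
        mmio_bus.read(0xd000_0004, &mut data)?;
        Ok(())
    }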
-use std::collections::HashMap; use std::ffi::CString; use std::fmt::Debug; use vm_fdt::{Error as VmFdtError, FdtWriter, FdtWriterNode}; use vm_memory::GuestMemoryError; -use super::super::DeviceType; use super::cache_info::{CacheEntry, read_cache_config}; use super::gic::GICDevice; use crate::device_manager::mmio::MMIODeviceInfo; @@ -55,12 +53,15 @@ pub enum FdtError { WriteFdtToMemory(#[from] GuestMemoryError), } +#[allow(clippy::too_many_arguments)] /// Creates the flattened device tree for this aarch64 microVM. pub fn create_fdt( guest_mem: &GuestMemoryMmap, vcpu_mpidr: Vec, cmdline: CString, - device_info: &HashMap<(DeviceType, String), MMIODeviceInfo>, + virtio_devices: Vec<&MMIODeviceInfo>, + rtc: Option<&MMIODeviceInfo>, + serial: Option<&MMIODeviceInfo>, gic_device: &GICDevice, vmgenid: &Option, initrd: &Option, @@ -89,7 +90,7 @@ pub fn create_fdt( create_timer_node(&mut fdt_writer)?; create_clock_node(&mut fdt_writer)?; create_psci_node(&mut fdt_writer)?; - create_devices_node(&mut fdt_writer, device_info)?; + create_devices_node(&mut fdt_writer, virtio_devices, rtc, serial)?; create_vmgenid_node(&mut fdt_writer, vmgenid)?; // End Header node. @@ -411,25 +412,21 @@ fn create_rtc_node(fdt: &mut FdtWriter, dev_info: &MMIODeviceInfo) -> Result<(), fn create_devices_node( fdt: &mut FdtWriter, - dev_info: &HashMap<(DeviceType, String), MMIODeviceInfo>, + mut virtio_devices: Vec<&MMIODeviceInfo>, + rtc: Option<&MMIODeviceInfo>, + serial: Option<&MMIODeviceInfo>, ) -> Result<(), FdtError> { - // Create one temp Vec to store all virtio devices - let mut ordered_virtio_device: Vec<&MMIODeviceInfo> = Vec::new(); - - for ((device_type, _device_id), info) in dev_info { - match device_type { - DeviceType::BootTimer => (), // since it's not a real device - DeviceType::Rtc => create_rtc_node(fdt, info)?, - DeviceType::Serial => create_serial_node(fdt, info)?, - DeviceType::Virtio(_) => { - ordered_virtio_device.push(info); - } - } + if let Some(device_info) = rtc { + create_rtc_node(fdt, device_info)?; + } + + if let Some(device_info) = serial { + create_serial_node(fdt, device_info)?; } // Sort out virtio devices by address from low to high and insert them into fdt table. - ordered_virtio_device.sort_by_key(|a| a.addr); - for ordered_device_info in ordered_virtio_device.drain(..) { + virtio_devices.sort_by_key(|a| a.addr); + for ordered_device_info in virtio_devices.drain(..) 
{ create_virtio_node(fdt, ordered_device_info)?; } @@ -464,35 +461,22 @@ mod tests { fn test_create_fdt_with_devices() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); - let dev_info: HashMap<(DeviceType, std::string::String), MMIODeviceInfo> = [ - ( - (DeviceType::Serial, DeviceType::Serial.to_string()), - MMIODeviceInfo { - addr: 0x00, - irq: Some(1u32), - len: LEN, - }, - ), - ( - (DeviceType::Virtio(1), "virtio".to_string()), - MMIODeviceInfo { - addr: LEN, - irq: Some(2u32), - len: LEN, - }, - ), - ( - (DeviceType::Rtc, "rtc".to_string()), - MMIODeviceInfo { - addr: 2 * LEN, - irq: Some(3u32), - len: LEN, - }, - ), - ] - .iter() - .cloned() - .collect(); + let serial = MMIODeviceInfo { + addr: 0x00, + irq: Some(1u32), + len: LEN, + }; + let virtio_device = MMIODeviceInfo { + addr: LEN, + irq: Some(2u32), + len: LEN, + }; + let rtc = MMIODeviceInfo { + addr: 2 * LEN, + irq: Some(3u32), + len: LEN, + }; + let kvm = Kvm::new().unwrap(); let vm = kvm.create_vm().unwrap(); let gic = create_gic(&vm, 1, None).unwrap(); @@ -500,7 +484,9 @@ mod tests { &mem, vec![0], CString::new("console=tty0").unwrap(), - &dev_info, + vec![&virtio_device], + Some(&rtc), + Some(&serial), &gic, &None, &None, @@ -520,7 +506,9 @@ mod tests { &mem, vec![0], CString::new("console=tty0").unwrap(), - &HashMap::<(DeviceType, std::string::String), MMIODeviceInfo>::new(), + Vec::new(), + None, + None, &gic, &Some(vmgenid), &None, @@ -545,7 +533,9 @@ mod tests { &mem, vec![0], CString::new("console=tty0").unwrap(), - &HashMap::<(DeviceType, std::string::String), MMIODeviceInfo>::new(), + Vec::new(), + None, + None, &gic, &None, &None, @@ -607,7 +597,9 @@ mod tests { &mem, vec![0], CString::new("console=tty0").unwrap(), - &HashMap::<(DeviceType, std::string::String), MMIODeviceInfo>::new(), + vec![], + None, + None, &gic, &None, &Some(initrd), diff --git a/src/vmm/src/arch/aarch64/mod.rs b/src/vmm/src/arch/aarch64/mod.rs index ead827c08c4..f945601c940 100644 --- a/src/vmm/src/arch/aarch64/mod.rs +++ b/src/vmm/src/arch/aarch64/mod.rs @@ -134,7 +134,9 @@ pub fn configure_system_for_boot( vmm.vm.guest_memory(), vcpu_mpidr, cmdline, - vmm.mmio_device_manager.get_device_info(), + vmm.mmio_device_manager.virtio_device_info(), + vmm.mmio_device_manager.rtc_device_info(), + vmm.mmio_device_manager.serial_device_info(), vmm.vm.get_irqchip(), &vmm.acpi_device_manager.vmgenid, initrd, diff --git a/src/vmm/src/arch/aarch64/vcpu.rs b/src/vmm/src/arch/aarch64/vcpu.rs index 5d49dacac19..7a591bdee91 100644 --- a/src/vmm/src/arch/aarch64/vcpu.rs +++ b/src/vmm/src/arch/aarch64/vcpu.rs @@ -7,6 +7,7 @@ use std::fmt::{Debug, Write}; use std::mem::offset_of; +use std::sync::Arc; use kvm_bindings::*; use kvm_ioctls::{VcpuExit, VcpuFd, VmFd}; @@ -119,7 +120,7 @@ pub struct KvmVcpu { #[derive(Default, Debug)] pub struct Peripherals { /// mmio bus. - pub mmio_bus: Option, + pub mmio_bus: Option>, } impl KvmVcpu { diff --git a/src/vmm/src/arch/x86_64/vcpu.rs b/src/vmm/src/arch/x86_64/vcpu.rs index b46d8e07b59..4eb609aadd6 100644 --- a/src/vmm/src/arch/x86_64/vcpu.rs +++ b/src/vmm/src/arch/x86_64/vcpu.rs @@ -7,6 +7,7 @@ use std::collections::BTreeMap; use std::fmt::Debug; +use std::sync::Arc; use kvm_bindings::{ CpuId, KVM_MAX_CPUID_ENTRIES, KVM_MAX_MSR_ENTRIES, Msrs, Xsave, kvm_debugregs, kvm_lapic_state, @@ -161,7 +162,7 @@ pub struct Peripherals { /// Pio bus. pub pio_bus: Option, /// Mmio bus. 
- pub mmio_bus: Option, + pub mmio_bus: Option>, } impl KvmVcpu { diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 963e47b2d1f..f867188b9bb 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -615,7 +615,11 @@ fn attach_legacy_devices_aarch64( if cmdline_contains_console { // Make stdout non-blocking. set_stdout_nonblocking(); - let serial = setup_serial_device(event_manager)?; + let serial = Arc::new(Mutex::new( + SerialDevice::new(Some(std::io::stdin()), SerialOut::Stdout(std::io::stdout())) + .map_err(VmmError::EventFd)?, + )); + event_manager.add_subscriber(serial.clone()); vmm.mmio_device_manager .register_mmio_serial(vmm.vm.fd(), &mut vmm.resource_allocator, serial, None) .map_err(VmmError::RegisterMMIODevice)?; @@ -800,7 +804,6 @@ pub(crate) mod tests { use vmm_sys_util::tempfile::TempFile; use super::*; - use crate::arch::DeviceType; use crate::device_manager::resources::ResourceAllocator; #[cfg(target_arch = "x86_64")] use crate::devices::legacy::serial::SerialOut; @@ -999,7 +1002,7 @@ pub(crate) mod tests { assert!( vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_VSOCK), &vsock_dev_id) + .get_virtio_device(TYPE_VSOCK, &vsock_dev_id) .is_some() ); } @@ -1017,7 +1020,7 @@ pub(crate) mod tests { assert!( vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_RNG), ENTROPY_DEV_ID) + .get_virtio_device(TYPE_RNG, ENTROPY_DEV_ID) .is_some() ); } @@ -1042,7 +1045,7 @@ pub(crate) mod tests { assert!( vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BALLOON), BALLOON_DEV_ID) + .get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) .is_some() ); } @@ -1093,7 +1096,7 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=/dev/vda ro")); assert!( vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BLOCK), drive_id.as_str()) + .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); } @@ -1114,7 +1117,7 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=PARTUUID=0eaa91a0-01 rw")); assert!( vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BLOCK), drive_id.as_str()) + .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); } @@ -1136,7 +1139,7 @@ pub(crate) mod tests { assert!(!cmdline_contains(&cmdline, "root=/dev/vda")); assert!( vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BLOCK), drive_id.as_str()) + .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); } @@ -1173,17 +1176,17 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=PARTUUID=0eaa91a0-01 rw")); assert!( vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BLOCK), "root") + .get_virtio_device(TYPE_BLOCK, "root") .is_some() ); assert!( vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BLOCK), "secondary") + .get_virtio_device(TYPE_BLOCK, "secondary") .is_some() ); assert!( vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BLOCK), "third") + .get_virtio_device(TYPE_BLOCK, "third") .is_some() ); @@ -1212,7 +1215,7 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=/dev/vda rw")); assert!( vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BLOCK), drive_id.as_str()) + .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); } @@ -1233,7 +1236,7 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=PARTUUID=0eaa91a0-01 ro")); assert!( vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BLOCK), drive_id.as_str()) + .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); } @@ 
-1254,7 +1257,7 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=/dev/vda rw")); assert!( vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BLOCK), drive_id.as_str()) + .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); } @@ -1267,11 +1270,7 @@ pub(crate) mod tests { let res = attach_boot_timer_device(&mut vmm, request_ts); res.unwrap(); - assert!( - vmm.mmio_device_manager - .get_device(DeviceType::BootTimer, &DeviceType::BootTimer.to_string()) - .is_some() - ); + assert!(vmm.mmio_device_manager.boot_timer.is_some()); } #[test] diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index f99db17e747..55e5dbc402f 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -20,11 +20,8 @@ use serde::{Deserialize, Serialize}; use vm_allocator::AllocPolicy; use super::resources::ResourceAllocator; -use crate::arch::DeviceType; -use crate::arch::DeviceType::Virtio; -use crate::devices::BusDevice; #[cfg(target_arch = "aarch64")] -use crate::devices::legacy::RTCDevice; +use crate::devices::legacy::{RTCDevice, SerialDevice}; use crate::devices::pseudo::BootTimer; use crate::devices::virtio::balloon::Balloon; use crate::devices::virtio::block::device::Block; @@ -43,9 +40,9 @@ pub enum MmioError { /// Failed to allocate requested resource: {0} Allocator(#[from] vm_allocator::Error), /// Failed to insert device on the bus: {0} - BusInsert(crate::devices::BusError), + BusInsert(#[from] vm_device::BusError), /// Failed to allocate requested resourc: {0} - Cmdline(linux_loader::cmdline::Error), + Cmdline(#[from] linux_loader::cmdline::Error), /// Failed to find the device on the bus. DeviceNotFound, /// Invalid device type found on the MMIO bus. @@ -73,7 +70,7 @@ pub enum MmioError { pub const MMIO_LEN: u64 = 0x1000; /// Stores the address range and irq allocated to this device. -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct MMIODeviceInfo { /// Mmio address at which the device is registered. pub addr: u64, @@ -117,11 +114,30 @@ fn add_virtio_aml( .append_aml_bytes(dsdt_data) } +#[derive(Debug, Clone)] +/// A descriptor for MMIO devices +pub struct MMIODevice { + /// MMIO resources allocated to the device + pub(crate) resources: MMIODeviceInfo, + /// The actual device + pub(crate) inner: Arc>, +} + /// Manages the complexities of registering a MMIO device. #[derive(Debug)] pub struct MMIODeviceManager { - pub(crate) bus: crate::devices::Bus, - pub(crate) id_to_dev_info: HashMap<(DeviceType, String), MMIODeviceInfo>, + pub(crate) bus: Arc, + /// VirtIO devices using an MMIO transport layer + pub(crate) virtio_devices: HashMap<(u32, String), MMIODevice>, + /// Boot timer device + pub(crate) boot_timer: Option>, + #[cfg(target_arch = "aarch64")] + /// Real-Time clock on Aarch64 platforms + pub(crate) rtc: Option>, + #[cfg(target_arch = "aarch64")] + /// Serial device on Aarch64 platforms + pub(crate) serial: Option>, + #[cfg(target_arch = "x86_64")] // We create the AML byte code for every VirtIO device in the order we build // it, so that we ensure the root block device is appears first in the DSDT. // This is needed, so that the root device appears as `/dev/vda` in the guest @@ -129,7 +145,6 @@ pub struct MMIODeviceManager { // The alternative would be that we iterate the bus to get the data after all // of the devices are build. 
However, iterating the bus won't give us the // devices in the order they were added. - #[cfg(target_arch = "x86_64")] pub(crate) dsdt_data: Vec, } @@ -137,8 +152,13 @@ impl MMIODeviceManager { /// Create a new DeviceManager handling mmio devices (virtio net, block). pub fn new() -> MMIODeviceManager { MMIODeviceManager { - bus: crate::devices::Bus::new(), - id_to_dev_info: HashMap::new(), + bus: Arc::new(vm_device::Bus::new()), + virtio_devices: HashMap::new(), + boot_timer: None, + #[cfg(target_arch = "aarch64")] + rtc: None, + #[cfg(target_arch = "aarch64")] + serial: None, #[cfg(target_arch = "x86_64")] dsdt_data: vec![], } @@ -168,20 +188,6 @@ impl MMIODeviceManager { Ok(device_info) } - /// Register a device at some MMIO address. - fn register_mmio_device( - &mut self, - identifier: (DeviceType, String), - device_info: MMIODeviceInfo, - device: Arc>, - ) -> Result<(), MmioError> { - self.bus - .insert(device, device_info.addr, device_info.len) - .map_err(MmioError::BusInsert)?; - self.id_to_dev_info.insert(identifier, device_info); - Ok(()) - } - /// Register a virtio-over-MMIO device to be used via MMIO transport at a specific slot. pub fn register_mmio_virtio( &mut self, @@ -198,7 +204,7 @@ impl MMIODeviceManager { let identifier; { let locked_device = mmio_device.locked_device(); - identifier = (DeviceType::Virtio(locked_device.device_type()), device_id); + identifier = (locked_device.device_type(), device_id); for (i, queue_evt) in locked_device.queue_events().iter().enumerate() { let io_addr = IoEventAddress::Mmio( device_info.addr + u64::from(crate::devices::virtio::NOTIFY_REG_OFFSET), @@ -210,11 +216,18 @@ impl MMIODeviceManager { .map_err(MmioError::RegisterIrqFd)?; } - self.register_mmio_device( + let device = Arc::new(Mutex::new(mmio_device)); + self.bus + .insert(device.clone(), device_info.addr, device_info.len)?; + self.virtio_devices.insert( identifier, - device_info.clone(), - Arc::new(Mutex::new(BusDevice::MmioTransport(mmio_device))), - ) + MMIODevice { + resources: *device_info, + inner: device, + }, + ); + + Ok(()) } /// Append a registered virtio-over-MMIO device to the kernel cmdline. @@ -272,7 +285,7 @@ impl MMIODeviceManager { &mut self, vm: &VmFd, resource_allocator: &mut ResourceAllocator, - serial: Arc>, + serial: Arc>, device_info_opt: Option, ) -> Result<(), MmioError> { // Create a new MMIODeviceInfo object on boot path or unwrap the @@ -284,20 +297,18 @@ impl MMIODeviceManager { }; vm.register_irqfd( - serial - .lock() - .expect("Poisoned lock") - .serial_ref() - .unwrap() - .serial - .interrupt_evt(), + serial.lock().expect("Poisoned lock").serial.interrupt_evt(), device_info.irq.unwrap(), ) .map_err(MmioError::RegisterIrqFd)?; - let identifier = (DeviceType::Serial, DeviceType::Serial.to_string()); - // Register the newly created Serial object. 
- self.register_mmio_device(identifier, device_info, serial) + self.bus + .insert(serial.clone(), device_info.addr, device_info.len)?; + self.serial = Some(MMIODevice { + resources: device_info, + inner: serial, + }); + Ok(()) } #[cfg(target_arch = "aarch64")] @@ -306,13 +317,16 @@ impl MMIODeviceManager { &self, cmdline: &mut kernel_cmdline::Cmdline, ) -> Result<(), MmioError> { - let device_info = self - .id_to_dev_info - .get(&(DeviceType::Serial, DeviceType::Serial.to_string())) - .ok_or(MmioError::DeviceNotFound)?; - cmdline - .insert("earlycon", &format!("uart,mmio,0x{:08x}", device_info.addr)) - .map_err(MmioError::Cmdline) + match &self.serial { + Some(device) => { + cmdline.insert( + "earlycon", + &format!("uart,mmio,0x{:08x}", device.resources.addr), + )?; + Ok(()) + } + None => Err(MmioError::DeviceNotFound), + } } #[cfg(target_arch = "aarch64")] @@ -324,6 +338,7 @@ impl MMIODeviceManager { rtc: RTCDevice, device_info_opt: Option, ) -> Result<(), MmioError> { + let device = Arc::new(Mutex::new(rtc)); // Create a new MMIODeviceInfo object on boot path or unwrap the // existing object on restore path. let device_info = if let Some(device_info) = device_info_opt { @@ -332,88 +347,53 @@ impl MMIODeviceManager { self.allocate_mmio_resources(resource_allocator, 1)? }; - // Create a new identifier for the RTC device. - let identifier = (DeviceType::Rtc, DeviceType::Rtc.to_string()); - // Attach the newly created RTC device. - self.register_mmio_device( - identifier, - device_info, - Arc::new(Mutex::new(BusDevice::RTCDevice(rtc))), - ) + self.bus + .insert(device.clone(), device_info.addr, device_info.len)?; + self.rtc = Some(MMIODevice { + resources: device_info, + inner: device, + }); + + Ok(()) } /// Register a boot timer device. pub fn register_mmio_boot_timer( &mut self, resource_allocator: &mut ResourceAllocator, - device: BootTimer, + boot_timer: BootTimer, ) -> Result<(), MmioError> { // Attach a new boot timer device. let device_info = self.allocate_mmio_resources(resource_allocator, 0)?; - let identifier = (DeviceType::BootTimer, DeviceType::BootTimer.to_string()); - self.register_mmio_device( - identifier, - device_info, - Arc::new(Mutex::new(BusDevice::BootTimer(device))), - ) - } - - /// Gets the information of the devices registered up to some point in time. - pub fn get_device_info(&self) -> &HashMap<(DeviceType, String), MMIODeviceInfo> { - &self.id_to_dev_info + let device = Arc::new(Mutex::new(boot_timer)); + self.bus + .insert(device.clone(), device_info.addr, device_info.len)?; + self.boot_timer = Some(MMIODevice { + resources: device_info, + inner: device, + }); + Ok(()) } /// Gets the specified device. - pub fn get_device( + pub fn get_virtio_device( &self, - device_type: DeviceType, + virtio_type: u32, device_id: &str, - ) -> Option<&Mutex> { - if let Some(device_info) = self - .id_to_dev_info - .get(&(device_type, device_id.to_string())) - { - if let Some((_, device)) = self.bus.get_device(device_info.addr) { - return Some(device); - } - } - None - } - - /// Run fn for each registered device. - pub fn for_each_device(&self, mut f: F) -> Result<(), E> - where - F: FnMut(&DeviceType, &String, &MMIODeviceInfo, &Mutex) -> Result<(), E>, - { - for ((device_type, device_id), device_info) in self.get_device_info().iter() { - let bus_device = self - .get_device(*device_type, device_id) - // Safe to unwrap() because we know the device exists. 
- .unwrap(); - f(device_type, device_id, device_info, bus_device)?; - } - Ok(()) + ) -> Option<&MMIODevice> { + self.virtio_devices + .get(&(virtio_type, device_id.to_string())) } /// Run fn for each registered virtio device. pub fn for_each_virtio_device(&self, mut f: F) -> Result<(), E> where - F: FnMut(u32, &String, &MMIODeviceInfo, Arc>) -> Result<(), E>, + F: FnMut(&u32, &String, &MMIODevice) -> Result<(), E>, { - self.for_each_device(|device_type, device_id, device_info, bus_device| { - if let Virtio(virtio_type) = device_type { - let virtio_device = bus_device - .lock() - .expect("Poisoned lock") - .mmio_transport_ref() - .expect("Unexpected device type") - .device(); - f(*virtio_type, device_id, device_info, virtio_device)?; - } - Ok(()) - })?; - + for ((virtio_type, device_id), mmio_device) in &self.virtio_devices { + f(virtio_type, device_id, mmio_device)?; + } Ok(()) } @@ -428,13 +408,8 @@ impl MMIODeviceManager { T: VirtioDevice + 'static + Debug, F: FnOnce(&mut T) -> Result<(), String>, { - if let Some(busdev) = self.get_device(DeviceType::Virtio(virtio_type), id) { - let virtio_device = busdev - .lock() - .expect("Poisoned lock") - .mmio_transport_ref() - .expect("Unexpected device type") - .device(); + if let Some(device) = self.get_virtio_device(virtio_type, id) { + let virtio_device = device.inner.lock().expect("Poisoned lock").device(); let mut dev = virtio_device.lock().expect("Poisoned lock"); f(dev .as_mut_any() @@ -451,73 +426,92 @@ impl MMIODeviceManager { pub fn kick_devices(&self) { info!("Artificially kick devices."); // We only kick virtio devices for now. - let _: Result<(), MmioError> = - self.for_each_virtio_device(|virtio_type, id, _info, dev| { - let mut virtio = dev.lock().expect("Poisoned lock"); - match virtio_type { - TYPE_BALLOON => { - let balloon = virtio.as_mut_any().downcast_mut::().unwrap(); - // If device is activated, kick the balloon queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in snapshot. - // Stats queue doesn't need kicking as it is notified via a `timer_fd`. - if balloon.is_activated() { - info!("kick balloon {}.", id); - balloon.process_virtio_queues().unwrap(); - } - } - TYPE_BLOCK => { - // We only care about kicking virtio block. - // If we need to kick vhost-user-block we can do nothing. - if let Some(block) = virtio.as_mut_any().downcast_mut::() { - // If device is activated, kick the block queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in - // snapshot. No need to kick Ratelimiters - // because they are restored 'unblocked' so - // any inflight `timer_fd` events can be safely discarded. - if block.is_activated() { - info!("kick block {}.", id); - block.process_virtio_queues().unwrap() - } - } + let _: Result<(), MmioError> = self.for_each_virtio_device(|virtio_type, id, device| { + let mmio_transport_locked = device.inner.lock().expect("Poisoned locked"); + let mut virtio = mmio_transport_locked.locked_device(); + match *virtio_type { + TYPE_BALLOON => { + let balloon = virtio.as_mut_any().downcast_mut::().unwrap(); + // If device is activated, kick the balloon queue(s) to make up for any + // pending or in-flight epoll events we may have not captured in snapshot. + // Stats queue doesn't need kicking as it is notified via a `timer_fd`. 
+ if balloon.is_activated() { + info!("kick balloon {}.", id); + balloon.process_virtio_queues().unwrap(); } - TYPE_NET => { - let net = virtio.as_mut_any().downcast_mut::().unwrap(); - // If device is activated, kick the net queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in snapshot. - // No need to kick Ratelimiters because they are restored 'unblocked' so + } + TYPE_BLOCK => { + // We only care about kicking virtio block. + // If we need to kick vhost-user-block we can do nothing. + if let Some(block) = virtio.as_mut_any().downcast_mut::() { + // If device is activated, kick the block queue(s) to make up for any + // pending or in-flight epoll events we may have not captured in + // snapshot. No need to kick Ratelimiters + // because they are restored 'unblocked' so // any inflight `timer_fd` events can be safely discarded. - if net.is_activated() { - info!("kick net {}.", id); - net.process_virtio_queues().unwrap(); + if block.is_activated() { + info!("kick block {}.", id); + block.process_virtio_queues().unwrap(); } } - TYPE_VSOCK => { - // Vsock has complicated protocol that isn't resilient to any packet loss, - // so for Vsock we don't support connection persistence through snapshot. - // Any in-flight packets or events are simply lost. - // Vsock is restored 'empty'. - // The only reason we still `kick` it is to make guest process - // `TRANSPORT_RESET_EVENT` event we sent during snapshot creation. - let vsock = virtio - .as_mut_any() - .downcast_mut::>() - .unwrap(); - if vsock.is_activated() { - info!("kick vsock {id}."); - vsock.signal_used_queue(0).unwrap(); - } + } + TYPE_NET => { + let net = virtio.as_mut_any().downcast_mut::().unwrap(); + // If device is activated, kick the net queue(s) to make up for any + // pending or in-flight epoll events we may have not captured in snapshot. + // No need to kick Ratelimiters because they are restored 'unblocked' so + // any inflight `timer_fd` events can be safely discarded. + if net.is_activated() { + info!("kick net {}.", id); + net.process_virtio_queues().unwrap(); } - TYPE_RNG => { - let entropy = virtio.as_mut_any().downcast_mut::().unwrap(); - if entropy.is_activated() { - info!("kick entropy {id}."); - entropy.process_virtio_queues().unwrap(); - } + } + TYPE_VSOCK => { + // Vsock has complicated protocol that isn't resilient to any packet loss, + // so for Vsock we don't support connection persistence through snapshot. + // Any in-flight packets or events are simply lost. + // Vsock is restored 'empty'. + // The only reason we still `kick` it is to make guest process + // `TRANSPORT_RESET_EVENT` event we sent during snapshot creation. 
+ let vsock = virtio + .as_mut_any() + .downcast_mut::>() + .unwrap(); + if vsock.is_activated() { + info!("kick vsock {id}."); + vsock.signal_used_queue(0).unwrap(); } - _ => (), } - Ok(()) - }); + TYPE_RNG => { + let entropy = virtio.as_mut_any().downcast_mut::().unwrap(); + if entropy.is_activated() { + info!("kick entropy {id}."); + entropy.process_virtio_queues().unwrap(); + } + } + _ => (), + } + Ok(()) + }); + } + + #[cfg(target_arch = "aarch64")] + pub fn virtio_device_info(&self) -> Vec<&MMIODeviceInfo> { + let mut device_info = Vec::new(); + for (_, dev) in self.virtio_devices.iter() { + device_info.push(&dev.resources); + } + device_info + } + + #[cfg(target_arch = "aarch64")] + pub fn rtc_device_info(&self) -> Option<&MMIODeviceInfo> { + self.rtc.as_ref().map(|device| &device.resources) + } + + #[cfg(target_arch = "aarch64")] + pub fn serial_device_info(&self) -> Option<&MMIODeviceInfo> { + self.serial.as_ref().map(|device| &device.resources) } } @@ -530,7 +524,6 @@ mod tests { use vmm_sys_util::eventfd::EventFd; use super::*; - use crate::Vm; use crate::devices::virtio::ActivateError; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::Queue; @@ -539,6 +532,7 @@ mod tests { use crate::test_utils::multi_region_mem_raw; use crate::vstate::kvm::Kvm; use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; + use crate::{Vm, arch}; const QUEUE_SIZES: &[u16] = &[64]; @@ -567,9 +561,9 @@ mod tests { #[cfg(target_arch = "x86_64")] /// Gets the number of interrupts used by the devices registered. pub fn used_irqs_count(&self) -> usize { - self.get_device_info() + self.virtio_devices .iter() - .filter(|(_, device_info)| device_info.irq.is_some()) + .filter(|(_, mmio_dev)| mmio_dev.resources.irq.is_some()) .count() } } @@ -682,6 +676,29 @@ mod tests { "dummy", ) .unwrap(); + + assert!(device_manager.get_virtio_device(0, "foo").is_none()); + let dev = device_manager.get_virtio_device(0, "dummy").unwrap(); + assert_eq!(dev.resources.addr, arch::MMIO_MEM_START); + assert_eq!(dev.resources.len, MMIO_LEN); + assert_eq!( + dev.resources.irq, + Some(arch::GSI_BASE) + ); + + device_manager + .for_each_virtio_device(|virtio_type, device_id, mmio_device| { + assert_eq!(*virtio_type, 0); + assert_eq!(device_id, "dummy"); + assert_eq!(mmio_device.resources.addr, arch::MMIO_MEM_START); + assert_eq!(mmio_device.resources.len, MMIO_LEN); + assert_eq!( + mmio_device.resources.irq, + Some(arch::GSI_BASE) + ); + Ok::<(), ()>(()) + }) + .unwrap(); } #[test] @@ -772,28 +789,23 @@ mod tests { &id, ) .unwrap(); - assert!( - device_manager - .get_device(DeviceType::Virtio(type_id), &id) - .is_some() - ); + assert!(device_manager.get_virtio_device(type_id, &id).is_some()); assert_eq!( addr, - device_manager.id_to_dev_info[&(DeviceType::Virtio(type_id), id.clone())].addr + device_manager.virtio_devices[&(type_id, id.clone())] + .resources + .addr ); assert_eq!( crate::arch::GSI_BASE, - device_manager.id_to_dev_info[&(DeviceType::Virtio(type_id), id)] + device_manager.virtio_devices[&(type_id, id)] + .resources .irq .unwrap() ); let id = "bar"; - assert!( - device_manager - .get_device(DeviceType::Virtio(type_id), id) - .is_none() - ); + assert!(device_manager.get_virtio_device(type_id, id).is_none()); let dummy2 = Arc::new(Mutex::new(DummyDevice::new())); let id2 = String::from("foo2"); @@ -809,15 +821,16 @@ mod tests { .unwrap(); let mut count = 0; - let _: Result<(), MmioError> = device_manager.for_each_device(|devtype, devid, _, _| { - assert_eq!(*devtype, 
DeviceType::Virtio(type_id)); - match devid.as_str() { - "foo" => count += 1, - "foo2" => count += 2, - _ => unreachable!(), - }; - Ok(()) - }); + let _: Result<(), MmioError> = + device_manager.for_each_virtio_device(|devtype, devid, _| { + assert_eq!(*devtype, type_id); + match devid.as_str() { + "foo" => count += 1, + "foo2" => count += 2, + _ => unreachable!(), + }; + Ok(()) + }); assert_eq!(count, 3); #[cfg(target_arch = "x86_64")] assert_eq!(device_manager.used_irqs_count(), 2); diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 5e5e53be7cb..432301b66a1 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -19,6 +19,10 @@ use crate::EventManager; #[cfg(target_arch = "aarch64")] use crate::arch::DeviceType; use crate::devices::acpi::vmgenid::{VMGenIDState, VMGenIdConstructorArgs, VmGenId, VmGenIdError}; +#[cfg(target_arch = "aarch64")] +use crate::devices::legacy::serial::SerialOut; +#[cfg(target_arch = "aarch64")] +use crate::devices::legacy::{RTCDevice, SerialDevice}; use crate::devices::virtio::balloon::persist::{BalloonConstructorArgs, BalloonState}; use crate::devices::virtio::balloon::{Balloon, BalloonError}; use crate::devices::virtio::block::BlockError; @@ -61,7 +65,7 @@ pub enum DevicePersistError { MmioTransport, #[cfg(target_arch = "aarch64")] /// Legacy: {0} - Legacy(#[from] crate::VmmError), + Legacy(#[from] std::io::Error), /// Net: {0} Net(#[from] NetError), /// Vsock: {0} @@ -266,32 +270,29 @@ impl<'a> Persist<'a> for MMIODeviceManager { fn save(&self) -> Self::State { let mut states = DeviceStates::default(); - let _: Result<(), ()> = self.for_each_device(|devtype, devid, device_info, bus_dev| { - if *devtype == crate::arch::DeviceType::BootTimer { - // No need to save BootTimer state. - return Ok(()); - } - #[cfg(target_arch = "aarch64")] - { - if *devtype == DeviceType::Serial || *devtype == DeviceType::Rtc { - states.legacy_devices.push(ConnectedLegacyState { - type_: *devtype, - device_info: device_info.clone(), - }); - return Ok(()); - } + #[cfg(target_arch = "aarch64")] + { + if let Some(device) = &self.serial { + states.legacy_devices.push(ConnectedLegacyState { + type_: DeviceType::Serial, + device_info: device.resources, + }); } - let locked_bus_dev = bus_dev.lock().expect("Poisoned lock"); - - let mmio_transport = locked_bus_dev - .mmio_transport_ref() - .expect("Unexpected device type"); + if let Some(device) = &self.rtc { + states.legacy_devices.push(ConnectedLegacyState { + type_: DeviceType::Rtc, + device_info: device.resources, + }); + } + } - let transport_state = mmio_transport.save(); + let _: Result<(), ()> = self.for_each_virtio_device(|_, devid, device| { + let mmio_transport_locked = device.inner.lock().expect("Poisoned lock"); + let transport_state = mmio_transport_locked.save(); - let mut locked_device = mmio_transport.locked_device(); + let mut locked_device = mmio_transport_locked.locked_device(); match locked_device.device_type() { TYPE_BALLOON => { let balloon_state = locked_device @@ -303,7 +304,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { device_id: devid.clone(), device_state: balloon_state, transport_state, - device_info: device_info.clone(), + device_info: device.resources, }); } // Both virtio-block and vhost-user-block share same device type. 
@@ -320,7 +321,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { device_id: devid.clone(), device_state: block.save(), transport_state, - device_info: device_info.clone(), + device_info: device.resources, }) } } @@ -338,7 +339,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { device_id: devid.clone(), device_state: net.save(), transport_state, - device_info: device_info.clone(), + device_info: device.resources, }); } TYPE_VSOCK => { @@ -367,7 +368,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { device_id: devid.clone(), device_state: vsock_state, transport_state, - device_info: device_info.clone(), + device_info: device.resources, }); } TYPE_RNG => { @@ -380,7 +381,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { device_id: devid.clone(), device_state: entropy.save(), transport_state, - device_info: device_info.clone(), + device_info: device.resources, }); } _ => unreachable!(), @@ -403,8 +404,13 @@ impl<'a> Persist<'a> for MMIODeviceManager { { for state in &state.legacy_devices { if state.type_ == DeviceType::Serial { - let serial = - crate::builder::setup_serial_device(constructor_args.event_manager)?; + let serial = Arc::new(Mutex::new(SerialDevice::new( + Some(std::io::stdin()), + SerialOut::Stdout(std::io::stdout()), + )?)); + constructor_args + .event_manager + .add_subscriber(serial.clone()); constructor_args .resource_allocator @@ -421,11 +427,11 @@ impl<'a> Persist<'a> for MMIODeviceManager { vm, constructor_args.resource_allocator, serial, - Some(state.device_info.clone()), + Some(state.device_info), )?; } if state.type_ == DeviceType::Rtc { - let rtc = crate::devices::legacy::RTCDevice(vm_superio::Rtc::with_events( + let rtc = RTCDevice(vm_superio::Rtc::with_events( &crate::devices::legacy::rtc_pl031::METRICS, )); constructor_args @@ -441,7 +447,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { dev_manager.register_mmio_rtc( constructor_args.resource_allocator, rtc, - Some(state.device_info.clone()), + Some(state.device_info), )?; } } @@ -703,24 +709,32 @@ mod tests { // know will results in `Ok` let mut clone = MMIODeviceManager::new(); // We only care about the device hashmap. - clone.id_to_dev_info.clone_from(&self.id_to_dev_info); + clone.virtio_devices.clone_from(&self.virtio_devices); + clone.boot_timer = self.boot_timer.clone(); clone } } + impl PartialEq for MMIODevice { + fn eq(&self, other: &Self) -> bool { + self.resources == other.resources + } + } + impl PartialEq for MMIODeviceManager { fn eq(&self, other: &MMIODeviceManager) -> bool { // We only care about the device hashmap. - if self.id_to_dev_info.len() != other.id_to_dev_info.len() { + if self.virtio_devices.len() != other.virtio_devices.len() { return false; } - for (key, val) in &self.id_to_dev_info { - match other.id_to_dev_info.get(key) { + for (key, val) in &self.virtio_devices { + match other.virtio_devices.get(key) { Some(other_val) if val == other_val => continue, _ => return false, - }; + } } - true + + self.boot_timer == other.boot_timer } } diff --git a/src/vmm/src/devices/bus.rs b/src/vmm/src/devices/bus.rs index 516b40cc93f..6f7e1531bf3 100644 --- a/src/vmm/src/devices/bus.rs +++ b/src/vmm/src/devices/bus.rs @@ -9,6 +9,8 @@ use std::cmp::{Ord, Ordering, PartialEq, PartialOrd}; use std::collections::btree_map::BTreeMap; +#[cfg(test)] +use std::sync::Barrier; use std::sync::{Arc, Mutex}; /// Errors triggered during bus operations. 
@@ -55,20 +57,14 @@ use event_manager::{EventOps, Events, MutEventSubscriber}; #[cfg(target_arch = "aarch64")] use super::legacy::RTCDevice; use super::legacy::{I8042Device, SerialDevice}; -use super::pseudo::BootTimer; -use super::virtio::transport::mmio::MmioTransport; #[derive(Debug)] pub enum BusDevice { I8042Device(I8042Device), #[cfg(target_arch = "aarch64")] RTCDevice(RTCDevice), - BootTimer(BootTimer), - MmioTransport(MmioTransport), Serial(SerialDevice), #[cfg(test)] - Dummy(DummyDevice), - #[cfg(test)] Constant(ConstantDevice), } @@ -77,9 +73,11 @@ pub enum BusDevice { pub struct DummyDevice; #[cfg(test)] -impl DummyDevice { - pub fn bus_write(&mut self, _offset: u64, _data: &[u8]) {} - pub fn bus_read(&mut self, _offset: u64, _data: &[u8]) {} +impl vm_device::BusDevice for DummyDevice { + fn write(&mut self, _base: u64, _offset: u64, _data: &[u8]) -> Option> { + None + } + fn read(&mut self, _base: u64, _offset: u64, _data: &mut [u8]) {} } #[cfg(test)] @@ -115,18 +113,6 @@ impl BusDevice { _ => None, } } - pub fn boot_timer_ref(&self) -> Option<&BootTimer> { - match self { - Self::BootTimer(x) => Some(x), - _ => None, - } - } - pub fn mmio_transport_ref(&self) -> Option<&MmioTransport> { - match self { - Self::MmioTransport(x) => Some(x), - _ => None, - } - } pub fn serial_ref(&self) -> Option<&SerialDevice> { match self { Self::Serial(x) => Some(x), @@ -147,18 +133,6 @@ impl BusDevice { _ => None, } } - pub fn boot_timer_mut(&mut self) -> Option<&mut BootTimer> { - match self { - Self::BootTimer(x) => Some(x), - _ => None, - } - } - pub fn mmio_transport_mut(&mut self) -> Option<&mut MmioTransport> { - match self { - Self::MmioTransport(x) => Some(x), - _ => None, - } - } pub fn serial_mut(&mut self) -> Option<&mut SerialDevice> { match self { Self::Serial(x) => Some(x), @@ -171,11 +145,8 @@ impl BusDevice { Self::I8042Device(x) => x.bus_read(offset, data), #[cfg(target_arch = "aarch64")] Self::RTCDevice(x) => x.bus_read(offset, data), - Self::BootTimer(x) => x.bus_read(offset, data), - Self::MmioTransport(x) => x.bus_read(offset, data), Self::Serial(x) => x.bus_read(offset, data), #[cfg(test)] - Self::Dummy(x) => x.bus_read(offset, data), #[cfg(test)] Self::Constant(x) => x.bus_read(offset, data), } @@ -186,12 +157,8 @@ impl BusDevice { Self::I8042Device(x) => x.bus_write(offset, data), #[cfg(target_arch = "aarch64")] Self::RTCDevice(x) => x.bus_write(offset, data), - Self::BootTimer(x) => x.bus_write(offset, data), - Self::MmioTransport(x) => x.bus_write(offset, data), Self::Serial(x) => x.bus_write(offset, data), #[cfg(test)] - Self::Dummy(x) => x.bus_write(offset, data), - #[cfg(test)] Self::Constant(x) => x.bus_write(offset, data), } } @@ -314,7 +281,7 @@ mod tests { #[test] fn bus_insert() { let mut bus = Bus::new(); - let dummy = Arc::new(Mutex::new(BusDevice::Dummy(DummyDevice))); + let dummy = Arc::new(Mutex::new(BusDevice::Constant(ConstantDevice))); // Insert len should not be 0. 
bus.insert(dummy.clone(), 0x10, 0).unwrap_err(); bus.insert(dummy.clone(), 0x10, 0x10).unwrap(); @@ -338,23 +305,6 @@ mod tests { bus.insert(dummy, 0x0, 0x10).unwrap(); } - #[test] - fn bus_read_write() { - let mut bus = Bus::new(); - let dummy = Arc::new(Mutex::new(BusDevice::Dummy(DummyDevice))); - bus.insert(dummy, 0x10, 0x10).unwrap(); - assert!(bus.read(0x10, &mut [0, 0, 0, 0])); - assert!(bus.write(0x10, &[0, 0, 0, 0])); - assert!(bus.read(0x11, &mut [0, 0, 0, 0])); - assert!(bus.write(0x11, &[0, 0, 0, 0])); - assert!(bus.read(0x16, &mut [0, 0, 0, 0])); - assert!(bus.write(0x16, &[0, 0, 0, 0])); - assert!(!bus.read(0x20, &mut [0, 0, 0, 0])); - assert!(!bus.write(0x20, &[0, 0, 0, 0])); - assert!(!bus.read(0x06, &mut [0, 0, 0, 0])); - assert!(!bus.write(0x06, &[0, 0, 0, 0])); - } - #[test] fn bus_read_write_values() { let mut bus = Bus::new(); @@ -381,7 +331,7 @@ mod tests { let mut bus = Bus::new(); let mut data = [1, 2, 3, 4]; bus.insert( - Arc::new(Mutex::new(BusDevice::Dummy(DummyDevice))), + Arc::new(Mutex::new(BusDevice::Constant(ConstantDevice))), 0x10, 0x10, ) diff --git a/src/vmm/src/devices/legacy/rtc_pl031.rs b/src/vmm/src/devices/legacy/rtc_pl031.rs index 754899a23a4..b7ebc827e85 100644 --- a/src/vmm/src/devices/legacy/rtc_pl031.rs +++ b/src/vmm/src/devices/legacy/rtc_pl031.rs @@ -80,7 +80,7 @@ impl RTCDevice { pub fn bus_read(&mut self, offset: u64, data: &mut [u8]) { if let (Ok(offset), 4) = (u16::try_from(offset), data.len()) { // read() function from RTC implementation expects a slice of - // len 4, and we just validated that this is the data lengt + // len 4, and we just validated that this is the data length self.read(offset, data.try_into().unwrap()) } else { warn!( @@ -108,6 +108,23 @@ impl RTCDevice { } } +#[cfg(target_arch = "aarch64")] +impl vm_device::BusDevice for RTCDevice { + fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + self.bus_read(offset, data) + } + + fn write( + &mut self, + _base: u64, + offset: u64, + data: &[u8], + ) -> Option> { + self.bus_write(offset, data); + None + } +} + #[cfg(test)] mod tests { use vm_superio::Rtc; diff --git a/src/vmm/src/devices/legacy/serial.rs b/src/vmm/src/devices/legacy/serial.rs index c73534e76c4..b895635e56b 100644 --- a/src/vmm/src/devices/legacy/serial.rs +++ b/src/vmm/src/devices/legacy/serial.rs @@ -382,6 +382,25 @@ impl } } +#[cfg(target_arch = "aarch64")] +impl vm_device::BusDevice + for SerialWrapper +{ + fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + self.bus_read(offset, data) + } + + fn write( + &mut self, + _base: u64, + offset: u64, + data: &[u8], + ) -> Option> { + self.bus_write(offset, data); + None + } +} + #[cfg(test)] mod tests { #![allow(clippy::undocumented_unsafe_blocks)] diff --git a/src/vmm/src/devices/pseudo/boot_timer.rs b/src/vmm/src/devices/pseudo/boot_timer.rs index ba16e92355f..f0cf38977b5 100644 --- a/src/vmm/src/devices/pseudo/boot_timer.rs +++ b/src/vmm/src/devices/pseudo/boot_timer.rs @@ -1,6 +1,8 @@ // Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 +use std::sync::{Arc, Barrier}; + use utils::time::TimestampUs; use crate::logger::info; @@ -8,16 +10,16 @@ use crate::logger::info; const MAGIC_VALUE_SIGNAL_GUEST_BOOT_COMPLETE: u8 = 123; /// Pseudo device to record the kernel boot time. 
-#[derive(Debug)] +#[derive(Debug, Clone)] pub struct BootTimer { start_ts: TimestampUs, } -impl BootTimer { - pub fn bus_write(&mut self, offset: u64, data: &[u8]) { +impl vm_device::BusDevice for BootTimer { + fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { // Only handle byte length instructions at a zero offset. if data.len() != 1 || offset != 0 { - return; + return None; } if data[0] == MAGIC_VALUE_SIGNAL_GUEST_BOOT_COMPLETE { @@ -33,8 +35,11 @@ impl BootTimer { boot_time_cpu_us / 1000 ); } + + None } - pub fn bus_read(&mut self, _offset: u64, _data: &[u8]) {} + + fn read(&mut self, _base: u64, _offset: u64, _data: &mut [u8]) {} } impl BootTimer { diff --git a/src/vmm/src/devices/virtio/transport/mmio.rs b/src/vmm/src/devices/virtio/transport/mmio.rs index 07cb03fbdbb..5ecc3fa8ffe 100644 --- a/src/vmm/src/devices/virtio/transport/mmio.rs +++ b/src/vmm/src/devices/virtio/transport/mmio.rs @@ -7,7 +7,7 @@ use std::fmt::Debug; use std::sync::atomic::{AtomicU32, Ordering}; -use std::sync::{Arc, Mutex, MutexGuard}; +use std::sync::{Arc, Barrier, Mutex, MutexGuard}; use vmm_sys_util::eventfd::EventFd; @@ -47,7 +47,7 @@ const MMIO_VERSION: u32 = 2; /// /// Typically one page (4096 bytes) of MMIO address space is sufficient to handle this transport /// and inner virtio device. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct MmioTransport { device: Arc>, // The register where feature bits are stored. @@ -232,8 +232,8 @@ impl MmioTransport { } } -impl MmioTransport { - pub fn bus_read(&mut self, offset: u64, data: &mut [u8]) { +impl vm_device::BusDevice for MmioTransport { + fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) { match offset { 0x00..=0xff if data.len() == 4 => { let v = match offset { @@ -287,12 +287,15 @@ impl MmioTransport { } 0x100..=0xfff => self.locked_device().read_config(offset - 0x100, data), _ => { - warn!("invalid virtio mmio read: {:#x}:{:#x}", offset, data.len()); + warn!( + "invalid virtio mmio read: {base:#x}:{offset:#x}:{:#x}", + data.len() + ); } }; } - pub fn bus_write(&mut self, offset: u64, data: &[u8]) { + fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option> { fn hi(v: &mut GuestAddress, x: u32) { *v = (*v & 0xffff_ffff) | (u64::from(x) << 32) } @@ -354,9 +357,13 @@ impl MmioTransport { } } _ => { - warn!("invalid virtio mmio write: {:#x}:{:#x}", offset, data.len()); + warn!( + "invalid virtio mmio write: {base:#x}:{offset:#x}:{:#x}", + data.len() + ); } } + None } } @@ -455,6 +462,7 @@ pub(crate) mod tests { use std::ops::Deref; + use vm_device::BusDevice; use vmm_sys_util::eventfd::EventFd; use super::*; @@ -567,7 +575,7 @@ pub(crate) mod tests { fn set_device_status(d: &mut MmioTransport, status: u32) { let mut buf = [0; 4]; write_le_u32(&mut buf[..], status); - d.bus_write(0x70, &buf[..]); + d.write(0x0, 0x70, &buf[..]); } #[test] @@ -615,7 +623,7 @@ pub(crate) mod tests { // The following read shouldn't be valid, because the length of the buf is not 4. buf.push(0); - d.bus_read(0, &mut buf[..]); + d.read(0x0, 0, &mut buf[..]); assert_eq!(buf[..4], buf_copy[..]); // the length is ok again @@ -623,74 +631,74 @@ pub(crate) mod tests { // Now we test that reading at various predefined offsets works as intended. 
- d.bus_read(0, &mut buf[..]); + d.read(0x0, 0, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), MMIO_MAGIC_VALUE); - d.bus_read(0x04, &mut buf[..]); + d.read(0x0, 0x04, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), MMIO_VERSION); - d.bus_read(0x08, &mut buf[..]); + d.read(0x0, 0x08, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), d.locked_device().device_type()); - d.bus_read(0x0c, &mut buf[..]); + d.read(0x0, 0x0c, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), VENDOR_ID); d.features_select = 0; - d.bus_read(0x10, &mut buf[..]); + d.read(0x0, 0x10, &mut buf[..]); assert_eq!( read_le_u32(&buf[..]), d.locked_device().avail_features_by_page(0) ); d.features_select = 1; - d.bus_read(0x10, &mut buf[..]); + d.read(0x0, 0x10, &mut buf[..]); assert_eq!( read_le_u32(&buf[..]), d.locked_device().avail_features_by_page(0) | 0x1 ); - d.bus_read(0x34, &mut buf[..]); + d.read(0x0, 0x34, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), 16); - d.bus_read(0x44, &mut buf[..]); + d.read(0x0, 0x44, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), u32::from(false)); d.interrupt.irq_status.store(111, Ordering::SeqCst); - d.bus_read(0x60, &mut buf[..]); + d.read(0x0, 0x60, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), 111); d.is_vhost_user = true; - d.interrupt.irq_status.store(0, Ordering::SeqCst); - d.bus_read(0x60, &mut buf[..]); + d.interrupt.status().store(0, Ordering::SeqCst); + d.read(0x0, 0x60, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), VIRTIO_MMIO_INT_VRING); d.is_vhost_user = true; d.interrupt .irq_status .store(VIRTIO_MMIO_INT_CONFIG, Ordering::SeqCst); - d.bus_read(0x60, &mut buf[..]); + d.read(0x0, 0x60, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), VIRTIO_MMIO_INT_CONFIG); - d.bus_read(0x70, &mut buf[..]); + d.read(0x0, 0x70, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), 0); d.config_generation = 5; - d.bus_read(0xfc, &mut buf[..]); + d.read(0x0, 0xfc, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), 5); // This read shouldn't do anything, as it's past the readable generic registers, and // before the device specific configuration space. Btw, reads from the device specific // conf space are going to be tested a bit later, alongside writes. buf = buf_copy.to_vec(); - d.bus_read(0xfd, &mut buf[..]); + d.read(0x0, 0xfd, &mut buf[..]); assert_eq!(buf[..], buf_copy[..]); // Read from an invalid address in generic register range. - d.bus_read(0xfb, &mut buf[..]); + d.read(0x0, 0xfb, &mut buf[..]); assert_eq!(buf[..], buf_copy[..]); // Read from an invalid length in generic register range. - d.bus_read(0xfc, &mut buf[..3]); + d.read(0x0, 0xfc, &mut buf[..3]); assert_eq!(buf[..], buf_copy[..]); } @@ -706,7 +714,7 @@ pub(crate) mod tests { // Nothing should happen, because the slice len > 4. 
d.features_select = 0; - d.bus_write(0x14, &buf[..]); + d.write(0x0, 0x14, &buf[..]); assert_eq!(d.features_select, 0); buf.pop(); @@ -718,7 +726,7 @@ pub(crate) mod tests { assert_eq!(d.locked_device().acked_features(), 0x0); d.acked_features_select = 0x0; write_le_u32(&mut buf[..], 1); - d.bus_write(0x20, &buf[..]); + d.write(0x0, 0x20, &buf[..]); assert_eq!(d.locked_device().acked_features(), 0x0); // Write to device specific configuration space should be ignored before setting @@ -727,8 +735,8 @@ pub(crate) mod tests { for i in (0..0xeff).rev() { let mut buf2 = vec![0; 0xeff]; - d.bus_write(0x100 + i as u64, &buf1[i..]); - d.bus_read(0x100, &mut buf2[..]); + d.write(0x0, 0x100 + i as u64, &buf1[i..]); + d.read(0x0, 0x100, &mut buf2[..]); for item in buf2.iter().take(0xeff) { assert_eq!(*item, 0); @@ -744,7 +752,7 @@ pub(crate) mod tests { // now writes should work d.features_select = 0; write_le_u32(&mut buf[..], 1); - d.bus_write(0x14, &buf[..]); + d.write(0x0, 0x14, &buf[..]); assert_eq!(d.features_select, 1); // Test acknowledging features on bus. @@ -753,12 +761,12 @@ pub(crate) mod tests { // Set the device available features in order to make acknowledging possible. dummy_dev.lock().unwrap().set_avail_features(0x124); - d.bus_write(0x20, &buf[..]); + d.write(0x0, 0x20, &buf[..]); assert_eq!(d.locked_device().acked_features(), 0x124); d.acked_features_select = 0; write_le_u32(&mut buf[..], 2); - d.bus_write(0x24, &buf[..]); + d.write(0x0, 0x24, &buf[..]); assert_eq!(d.acked_features_select, 2); set_device_status( &mut d, @@ -769,31 +777,31 @@ pub(crate) mod tests { assert_eq!(d.locked_device().acked_features(), 0x124); d.acked_features_select = 0x0; write_le_u32(&mut buf[..], 1); - d.bus_write(0x20, &buf[..]); + d.write(0x0, 0x20, &buf[..]); assert_eq!(d.locked_device().acked_features(), 0x124); // Setup queues d.queue_select = 0; write_le_u32(&mut buf[..], 3); - d.bus_write(0x30, &buf[..]); + d.write(0x0, 0x30, &buf[..]); assert_eq!(d.queue_select, 3); d.queue_select = 0; assert_eq!(d.locked_device().queues()[0].size, 0); write_le_u32(&mut buf[..], 16); - d.bus_write(0x38, &buf[..]); + d.write(0x0, 0x38, &buf[..]); assert_eq!(d.locked_device().queues()[0].size, 16); assert!(!d.locked_device().queues()[0].ready); write_le_u32(&mut buf[..], 1); - d.bus_write(0x44, &buf[..]); + d.write(0x0, 0x44, &buf[..]); assert!(d.locked_device().queues()[0].ready); assert_eq!(d.locked_device().queues()[0].desc_table_address.0, 0); write_le_u32(&mut buf[..], 123); - d.bus_write(0x80, &buf[..]); + d.write(0x0, 0x80, &buf[..]); assert_eq!(d.locked_device().queues()[0].desc_table_address.0, 123); - d.bus_write(0x84, &buf[..]); + d.write(0x0, 0x84, &buf[..]); assert_eq!( d.locked_device().queues()[0].desc_table_address.0, 123 + (123 << 32) @@ -801,9 +809,9 @@ pub(crate) mod tests { assert_eq!(d.locked_device().queues()[0].avail_ring_address.0, 0); write_le_u32(&mut buf[..], 124); - d.bus_write(0x90, &buf[..]); + d.write(0x0, 0x90, &buf[..]); assert_eq!(d.locked_device().queues()[0].avail_ring_address.0, 124); - d.bus_write(0x94, &buf[..]); + d.write(0x0, 0x94, &buf[..]); assert_eq!( d.locked_device().queues()[0].avail_ring_address.0, 124 + (124 << 32) @@ -811,9 +819,9 @@ pub(crate) mod tests { assert_eq!(d.locked_device().queues()[0].used_ring_address.0, 0); write_le_u32(&mut buf[..], 125); - d.bus_write(0xa0, &buf[..]); + d.write(0x0, 0xa0, &buf[..]); assert_eq!(d.locked_device().queues()[0].used_ring_address.0, 125); - d.bus_write(0xa4, &buf[..]); + d.write(0x0, 0xa4, &buf[..]); assert_eq!( 
d.locked_device().queues()[0].used_ring_address.0, 125 + (125 << 32) @@ -829,17 +837,17 @@ pub(crate) mod tests { d.interrupt.irq_status.store(0b10_1010, Ordering::Relaxed); write_le_u32(&mut buf[..], 0b111); - d.bus_write(0x64, &buf[..]); + d.write(0x0, 0x64, &buf[..]); assert_eq!(d.interrupt.irq_status.load(Ordering::Relaxed), 0b10_1000); // Write to an invalid address in generic register range. write_le_u32(&mut buf[..], 0xf); d.config_generation = 0; - d.bus_write(0xfb, &buf[..]); + d.write(0x0, 0xfb, &buf[..]); assert_eq!(d.config_generation, 0); // Write to an invalid length in generic register range. - d.bus_write(0xfc, &buf[..2]); + d.write(0x0, 0xfc, &buf[..2]); assert_eq!(d.config_generation, 0); // Here we test writes/read into/from the device specific configuration space. @@ -847,8 +855,8 @@ pub(crate) mod tests { for i in (0..0xeff).rev() { let mut buf2 = vec![0; 0xeff]; - d.bus_write(0x100 + i as u64, &buf1[i..]); - d.bus_read(0x100, &mut buf2[..]); + d.write(0x0, 0x100 + i as u64, &buf1[i..]); + d.read(0x0, 0x100, &mut buf2[..]); for item in buf2.iter().take(i) { assert_eq!(*item, 0); @@ -903,17 +911,17 @@ pub(crate) mod tests { for q in 0..queue_len { d.queue_select = q.try_into().unwrap(); write_le_u32(&mut buf[..], 16); - d.bus_write(0x38, &buf[..]); + d.write(0x0, 0x38, &buf[..]); write_le_u32(&mut buf[..], 1); - d.bus_write(0x44, &buf[..]); + d.write(0x0, 0x44, &buf[..]); } assert!(!d.locked_device().is_activated()); // Device should be ready for activation now. // A couple of invalid writes; will trigger warnings; shouldn't activate the device. - d.bus_write(0xa8, &buf[..]); - d.bus_write(0x1000, &buf[..]); + d.write(0x0, 0xa8, &buf[..]); + d.write(0x0, 0x1000, &buf[..]); assert!(!d.locked_device().is_activated()); set_device_status( @@ -936,8 +944,8 @@ pub(crate) mod tests { // a warning path and have no effect on queue state. 
write_le_u32(&mut buf[..], 0); d.queue_select = 0; - d.bus_write(0x44, &buf[..]); - d.bus_read(0x44, &mut buf[..]); + d.write(0x0, 0x44, &buf[..]); + d.read(0x0, 0x44, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), 1); } @@ -963,9 +971,9 @@ pub(crate) mod tests { for q in 0..queue_len { d.queue_select = q.try_into().unwrap(); write_le_u32(&mut buf[..], 16); - d.bus_write(0x38, &buf[..]); + d.write(0x0, 0x38, &buf[..]); write_le_u32(&mut buf[..], 1); - d.bus_write(0x44, &buf[..]); + d.write(0x0, 0x44, &buf[..]); } assert!(!d.locked_device().is_activated()); @@ -1010,9 +1018,9 @@ pub(crate) mod tests { for q in 0..queues_count { d.queue_select = q.try_into().unwrap(); write_le_u32(&mut buf[..], 16); - d.bus_write(0x38, &buf[..]); + d.write(0x0, 0x38, &buf[..]); write_le_u32(&mut buf[..], 1); - d.bus_write(0x44, &buf[..]); + d.write(0x0, 0x44, &buf[..]); } assert!(!d.locked_device().is_activated()); @@ -1052,13 +1060,13 @@ pub(crate) mod tests { // Marking device as FAILED should not affect device_activated state write_le_u32(&mut buf[..], 0x8f); - d.bus_write(0x70, &buf[..]); + d.write(0x0, 0x70, &buf[..]); assert_eq!(d.device_status, 0x8f); assert!(d.locked_device().is_activated()); // Nothing happens when backend driver doesn't support reset write_le_u32(&mut buf[..], 0x0); - d.bus_write(0x70, &buf[..]); + d.write(0x0, 0x70, &buf[..]); assert_eq!(d.device_status, 0x8f); assert!(d.locked_device().is_activated()); } diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 2a923637e93..964d6ab67cc 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -124,6 +124,7 @@ use std::time::Duration; use device_manager::acpi::ACPIDeviceManager; use device_manager::resources::ResourceAllocator; use devices::acpi::vmgenid::VmGenIdError; +use devices::virtio::device::VirtioDevice; use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEventSubscriber}; use seccomp::BpfProgram; use userfaultfd::Uffd; @@ -133,7 +134,6 @@ use vmm_sys_util::terminal::Terminal; use vstate::kvm::Kvm; use vstate::vcpu::{self, StartThreadedError, VcpuSendEventError}; -use crate::arch::DeviceType; use crate::cpu_config::templates::CpuConfiguration; #[cfg(target_arch = "x86_64")] use crate::device_manager::legacy::PortIODeviceManager; @@ -340,12 +340,16 @@ impl Vmm { } /// Gets the specified bus device. - pub fn get_bus_device( + pub fn get_virtio_device( &self, - device_type: DeviceType, + device_type: u32, device_id: &str, - ) -> Option<&Mutex> { - self.mmio_device_manager.get_device(device_type, device_id) + ) -> Option>> { + let device = self + .mmio_device_manager + .get_virtio_device(device_type, device_id)?; + + Some(device.inner.lock().expect("Poisoned lock").device().clone()) } /// Starts the microVM vcpus. 
@@ -450,20 +454,14 @@ impl Vmm { #[cfg(target_arch = "aarch64")] { - let serial_bus_device = self.get_bus_device(DeviceType::Serial, "Serial"); - if serial_bus_device.is_none() { - return Ok(()); - } - let mut serial_device_locked = - serial_bus_device.unwrap().lock().expect("Poisoned lock"); - let serial = serial_device_locked - .serial_mut() - .expect("Unexpected BusDeviceType"); + if let Some(device) = &self.mmio_device_manager.serial { + let mut device_locked = device.inner.lock().expect("Poisoned lock"); - serial - .serial - .write(IER_RDA_OFFSET, IER_RDA_BIT) - .map_err(|_| EmulateSerialInitError(std::io::Error::last_os_error()))?; + device_locked + .serial + .write(IER_RDA_OFFSET, IER_RDA_BIT) + .map_err(|_| EmulateSerialInitError(std::io::Error::last_os_error()))?; + } Ok(()) } @@ -644,15 +642,7 @@ impl Vmm { /// Returns a reference to the balloon device if present. pub fn balloon_config(&self) -> Result { - if let Some(busdev) = self.get_bus_device(DeviceType::Virtio(TYPE_BALLOON), BALLOON_DEV_ID) - { - let virtio_device = busdev - .lock() - .expect("Poisoned lock") - .mmio_transport_ref() - .expect("Unexpected device type") - .device(); - + if let Some(virtio_device) = self.get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) { let config = virtio_device .lock() .expect("Poisoned lock") @@ -669,15 +659,7 @@ impl Vmm { /// Returns the latest balloon statistics if they are enabled. pub fn latest_balloon_stats(&self) -> Result { - if let Some(busdev) = self.get_bus_device(DeviceType::Virtio(TYPE_BALLOON), BALLOON_DEV_ID) - { - let virtio_device = busdev - .lock() - .expect("Poisoned lock") - .mmio_transport_ref() - .expect("Unexpected device type") - .device(); - + if let Some(virtio_device) = self.get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) { let latest_stats = virtio_device .lock() .expect("Poisoned lock") @@ -702,16 +684,8 @@ impl Vmm { return Err(BalloonError::TooManyPagesRequested); } - if let Some(busdev) = self.get_bus_device(DeviceType::Virtio(TYPE_BALLOON), BALLOON_DEV_ID) - { + if let Some(virtio_device) = self.get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) { { - let virtio_device = busdev - .lock() - .expect("Poisoned lock") - .mmio_transport_ref() - .expect("Unexpected device type") - .device(); - virtio_device .lock() .expect("Poisoned lock") @@ -732,16 +706,8 @@ impl Vmm { &mut self, stats_polling_interval_s: u16, ) -> Result<(), BalloonError> { - if let Some(busdev) = self.get_bus_device(DeviceType::Virtio(TYPE_BALLOON), BALLOON_DEV_ID) - { + if let Some(virtio_device) = self.get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) { { - let virtio_device = busdev - .lock() - .expect("Poisoned lock") - .mmio_transport_ref() - .expect("Unexpected device type") - .device(); - virtio_device .lock() .expect("Poisoned lock") diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 1ff158d9973..14af3ecd792 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -69,7 +69,7 @@ impl From<&VmResources> for VmInfo { } } -/// Contains the necesary state for saving/restoring a microVM. +/// Contains the necessary state for saving/restoring a microVM. #[derive(Debug, Default, Serialize, Deserialize)] pub struct MicrovmState { /// Miscellaneous VM info. @@ -172,8 +172,9 @@ pub fn create_snapshot( // This should never fail as we only mark pages only if device has already been activated, // and the address validation was already performed on device activation. 
vmm.mmio_device_manager - .for_each_virtio_device(|_, _, _, dev| { - let mut d = dev.lock().unwrap(); + .for_each_virtio_device(|_, _, device| { + let mmio_dev_locked = device.inner.lock().expect("Poisoned lock"); + let mut d = mmio_dev_locked.locked_device(); if d.is_activated() { d.mark_queue_memory_dirty(vmm.vm.guest_memory()) } else { diff --git a/src/vmm/src/vstate/vcpu.rs b/src/vmm/src/vstate/vcpu.rs index 8b6298079f3..c578d98fdb3 100644 --- a/src/vmm/src/vstate/vcpu.rs +++ b/src/vmm/src/vstate/vcpu.rs @@ -174,7 +174,7 @@ impl Vcpu { } /// Sets a MMIO bus for this vcpu. - pub fn set_mmio_bus(&mut self, mmio_bus: crate::devices::Bus) { + pub fn set_mmio_bus(&mut self, mmio_bus: Arc<vm_device::Bus>) { self.kvm_vcpu.peripherals.mmio_bus = Some(mmio_bus); } @@ -481,7 +481,9 @@ fn handle_kvm_exit( VcpuExit::MmioRead(addr, data) => { if let Some(mmio_bus) = &peripherals.mmio_bus { let _metric = METRICS.vcpu.exit_mmio_read_agg.record_latency_metrics(); - mmio_bus.read(addr, data); + if let Err(err) = mmio_bus.read(addr, data) { + warn!("Invalid MMIO read @ {addr:#x}:{:#x}: {err}", data.len()); + } METRICS.vcpu.exit_mmio_read.inc(); } Ok(VcpuEmulation::Handled) @@ -489,7 +491,9 @@ fn handle_kvm_exit( VcpuExit::MmioWrite(addr, data) => { if let Some(mmio_bus) = &peripherals.mmio_bus { let _metric = METRICS.vcpu.exit_mmio_write_agg.record_latency_metrics(); - mmio_bus.write(addr, data); + if let Err(err) = mmio_bus.write(addr, data) { + warn!("Invalid MMIO write @ {addr:#x}:{:#x}: {err}", data.len()); + } METRICS.vcpu.exit_mmio_write.inc(); } Ok(VcpuEmulation::Handled) @@ -719,7 +723,6 @@ pub(crate) mod tests { use super::*; use crate::RECV_TIMEOUT_SEC; use crate::arch::{BootProtocol, EntryPoint}; - use crate::devices::BusDevice; use crate::devices::bus::DummyDevice; use crate::seccomp::get_empty_filters; use crate::utils::mib_to_bytes; @@ -824,8 +827,8 @@ pub(crate) mod tests { ) ); - let mut bus = crate::devices::Bus::new(); - let dummy = Arc::new(Mutex::new(BusDevice::Dummy(DummyDevice))); + let bus = Arc::new(vm_device::Bus::new()); + let dummy = Arc::new(Mutex::new(DummyDevice)); bus.insert(dummy, 0x10, 0x10).unwrap(); vcpu.set_mmio_bus(bus); let addr = 0x10; @@ -967,7 +970,7 @@ pub(crate) mod tests { fn test_set_mmio_bus() { let (_, _, mut vcpu) = setup_vcpu(0x1000); assert!(vcpu.kvm_vcpu.peripherals.mmio_bus.is_none()); - vcpu.set_mmio_bus(crate::devices::Bus::new()); + vcpu.set_mmio_bus(Arc::new(vm_device::Bus::new())); assert!(vcpu.kvm_vcpu.peripherals.mmio_bus.is_some()); } From b7b21859d43ea6d4df1ea8cf4057fb5193e642dc Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 30 Apr 2025 17:25:59 +0200 Subject: [PATCH 09/99] refactor: simplify creating RTCDevice We were always constructing RTCDevice using a set of metrics that were defined in the RTC module itself. Don't leak the metrics to other modules. Instead, create a new() function that always constructs it the correct way.
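To illustrate the effect at call sites, a minimal before/after sketch (the call sites are taken from the diff below; the snippet itself is not part of the patch):

    // Before: callers had to reach into the RTC module's METRICS static.
    let rtc = RTCDevice(Rtc::with_events(&crate::devices::legacy::rtc_pl031::METRICS));

    // After: construction is encapsulated, so other modules never name METRICS.
    let rtc = RTCDevice::new();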
Signed-off-by: Babis Chalios --- src/vmm/src/builder.rs | 6 +----- src/vmm/src/device_manager/persist.rs | 4 +--- src/vmm/src/devices/legacy/rtc_pl031.rs | 15 ++++++++++++++- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index f867188b9bb..6795a53d778 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -15,8 +15,6 @@ use userfaultfd::Uffd; use utils::time::TimestampUs; #[cfg(target_arch = "aarch64")] use vm_memory::GuestAddress; -#[cfg(target_arch = "aarch64")] -use vm_superio::Rtc; use crate::arch::{ConfigurationError, configure_system_for_boot, load_kernel}; #[cfg(target_arch = "aarch64")] @@ -628,9 +626,7 @@ fn attach_legacy_devices_aarch64( .map_err(VmmError::RegisterMMIODevice)?; } - let rtc = RTCDevice(Rtc::with_events( - &crate::devices::legacy::rtc_pl031::METRICS, - )); + let rtc = RTCDevice::new(); vmm.mmio_device_manager .register_mmio_rtc(&mut vmm.resource_allocator, rtc, None) .map_err(VmmError::RegisterMMIODevice) diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 432301b66a1..6a54a67b33d 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -431,9 +431,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { )?; } if state.type_ == DeviceType::Rtc { - let rtc = RTCDevice(vm_superio::Rtc::with_events( - &crate::devices::legacy::rtc_pl031::METRICS, - )); + let rtc = RTCDevice::new(); constructor_args .resource_allocator .allocate_mmio_memory( diff --git a/src/vmm/src/devices/legacy/rtc_pl031.rs b/src/vmm/src/devices/legacy/rtc_pl031.rs index b7ebc827e85..b025c1d1512 100644 --- a/src/vmm/src/devices/legacy/rtc_pl031.rs +++ b/src/vmm/src/devices/legacy/rtc_pl031.rs @@ -4,6 +4,7 @@ use std::convert::TryInto; use serde::Serialize; +use vm_superio::Rtc; use vm_superio::rtc_pl031::RtcEvents; use crate::logger::{IncMetric, SharedIncMetric, warn}; @@ -59,7 +60,19 @@ pub static METRICS: RTCDeviceMetrics = RTCDeviceMetrics::new(); /// Wrapper over vm_superio's RTC implementation. #[derive(Debug)] -pub struct RTCDevice(pub vm_superio::Rtc<&'static RTCDeviceMetrics>); +pub struct RTCDevice(vm_superio::Rtc<&'static RTCDeviceMetrics>); + +impl Default for RTCDevice { + fn default() -> Self { + RTCDevice(Rtc::with_events(&METRICS)) + } +} + +impl RTCDevice { + pub fn new() -> RTCDevice { + Default::default() + } +} impl std::ops::Deref for RTCDevice { type Target = vm_superio::Rtc<&'static RTCDeviceMetrics>; From 62693e8fab9527dda897b5157c7bd5768851c7fe Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Thu, 24 Apr 2025 17:14:25 +0200 Subject: [PATCH 10/99] refactor: use vm_device::Bus for IO bus Use the vm_device::Bus bus for PortIO devices on x86. PCIe devices will use this as well. 
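As a rough sketch of the pattern this moves to, assuming the vm_device trait shape used in the earlier patches of this series (DummyPort, pio_bus_sketch and the addresses below are made up for illustration, not Firecracker code):

    use std::sync::{Arc, Barrier, Mutex};

    // Stand-in port I/O device; SerialDevice and I8042Device implement the same trait.
    #[derive(Debug)]
    struct DummyPort;

    impl vm_device::BusDevice for DummyPort {
        fn read(&mut self, _base: u64, _offset: u64, data: &mut [u8]) {
            data.fill(0);
        }

        fn write(&mut self, _base: u64, _offset: u64, _data: &[u8]) -> Option<Arc<Barrier>> {
            None
        }
    }

    fn pio_bus_sketch() -> Result<(), vm_device::BusError> {
        // Devices are wrapped in Arc<Mutex<..>> and registered on the shared bus.
        let pio_bus = Arc::new(vm_device::Bus::new());
        pio_bus.insert(Arc::new(Mutex::new(DummyPort)), 0x3f8, 0x8)?;

        // Bus accesses return a Result, so the vcpu exit handlers can log
        // unmapped accesses instead of silently ignoring them.
        let mut data = [0u8; 1];
        if let Err(err) = pio_bus.read(0x3f8, &mut data) {
            eprintln!("pio read failed: {err}");
        }
        Ok(())
    }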
Signed-off-by: Babis Chalios --- src/vmm/src/arch/x86_64/vcpu.rs | 12 +- src/vmm/src/builder.rs | 24 +- src/vmm/src/device_manager/legacy.rs | 51 ++-- src/vmm/src/devices/bus.rs | 354 --------------------------- src/vmm/src/devices/legacy/i8042.rs | 61 ++--- src/vmm/src/devices/legacy/serial.rs | 33 +-- src/vmm/src/devices/mod.rs | 2 - src/vmm/src/lib.rs | 5 +- src/vmm/src/vstate/vcpu.rs | 12 +- src/vmm/tests/devices.rs | 13 +- src/vmm/tests/integration_tests.rs | 2 +- 11 files changed, 112 insertions(+), 457 deletions(-) delete mode 100644 src/vmm/src/devices/bus.rs diff --git a/src/vmm/src/arch/x86_64/vcpu.rs b/src/vmm/src/arch/x86_64/vcpu.rs index 4eb609aadd6..eea1f24ae69 100644 --- a/src/vmm/src/arch/x86_64/vcpu.rs +++ b/src/vmm/src/arch/x86_64/vcpu.rs @@ -160,7 +160,7 @@ pub struct KvmVcpu { #[derive(Default, Debug)] pub struct Peripherals { /// Pio bus. - pub pio_bus: Option, + pub pio_bus: Option>, /// Mmio bus. pub mmio_bus: Option>, } @@ -267,7 +267,7 @@ impl KvmVcpu { } /// Sets a Port Mapped IO bus for this vcpu. - pub fn set_pio_bus(&mut self, pio_bus: crate::devices::Bus) { + pub fn set_pio_bus(&mut self, pio_bus: Arc) { self.peripherals.pio_bus = Some(pio_bus); } @@ -711,7 +711,9 @@ impl Peripherals { VcpuExit::IoIn(addr, data) => { if let Some(pio_bus) = &self.pio_bus { let _metric = METRICS.vcpu.exit_io_in_agg.record_latency_metrics(); - pio_bus.read(u64::from(addr), data); + if let Err(err) = pio_bus.read(u64::from(addr), data) { + warn!("vcpu: IO read @ {addr:#x}:{:#x} failed: {err}", data.len()); + } METRICS.vcpu.exit_io_in.inc(); } Ok(VcpuEmulation::Handled) @@ -719,7 +721,9 @@ impl Peripherals { VcpuExit::IoOut(addr, data) => { if let Some(pio_bus) = &self.pio_bus { let _metric = METRICS.vcpu.exit_io_out_agg.record_latency_metrics(); - pio_bus.write(u64::from(addr), data); + if let Err(err) = pio_bus.write(u64::from(addr), data) { + warn!("vcpu: IO write @ {addr:#x}:{:#x} failed: {err}", data.len()); + } METRICS.vcpu.exit_io_out.inc(); } Ok(VcpuEmulation::Handled) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 6795a53d778..285c0df0058 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -15,6 +15,8 @@ use userfaultfd::Uffd; use utils::time::TimestampUs; #[cfg(target_arch = "aarch64")] use vm_memory::GuestAddress; +#[cfg(target_arch = "x86_64")] +use vmm_sys_util::eventfd::EventFd; use crate::arch::{ConfigurationError, configure_system_for_boot, load_kernel}; #[cfg(target_arch = "aarch64")] @@ -30,8 +32,9 @@ use crate::device_manager::persist::{ ACPIDeviceManagerConstructorArgs, ACPIDeviceManagerRestoreError, MMIODevManagerConstructorArgs, }; use crate::device_manager::resources::ResourceAllocator; -use crate::devices::BusDevice; use crate::devices::acpi::vmgenid::{VmGenId, VmGenIdError}; +#[cfg(target_arch = "x86_64")] +use crate::devices::legacy::I8042Device; #[cfg(target_arch = "aarch64")] use crate::devices::legacy::RTCDevice; use crate::devices::legacy::SerialDevice; @@ -162,10 +165,14 @@ fn create_vmm_and_vcpus( // x86_64 uses the i8042 reset event as the Vmm exit event. 
let reset_evt = vcpus_exit_evt.try_clone().map_err(VmmError::EventFd)?; + let i8042 = Arc::new(Mutex::new(I8042Device::new( + reset_evt, + EventFd::new(libc::EFD_NONBLOCK).map_err(VmmError::EventFd)?, + ))); // create pio dev manager with legacy devices let mut pio_dev_mgr = - PortIODeviceManager::new(serial_device, reset_evt).map_err(VmmError::LegacyIOBus)?; + PortIODeviceManager::new(serial_device, i8042).map_err(VmmError::LegacyIOBus)?; pio_dev_mgr .register_devices(vm.fd()) .map_err(VmmError::LegacyIOBus)?; @@ -549,11 +556,11 @@ pub fn build_microvm_from_snapshot( /// Sets up the serial device. pub fn setup_serial_device( event_manager: &mut EventManager, -) -> Result>, VmmError> { - let serial = Arc::new(Mutex::new(BusDevice::Serial( +) -> Result>, VmmError> { + let serial = Arc::new(Mutex::new( SerialDevice::new(Some(std::io::stdin()), SerialOut::Stdout(std::io::stdout())) .map_err(VmmError::EventFd)?, - ))); + )); event_manager.add_subscriber(serial.clone()); Ok(serial) } @@ -879,10 +886,13 @@ pub(crate) mod tests { let acpi_device_manager = ACPIDeviceManager::new(); #[cfg(target_arch = "x86_64")] let pio_device_manager = PortIODeviceManager::new( - Arc::new(Mutex::new(BusDevice::Serial( + Arc::new(Mutex::new( SerialDevice::new(None, SerialOut::Sink(std::io::sink())).unwrap(), + )), + Arc::new(Mutex::new(I8042Device::new( + EventFd::new(libc::EFD_NONBLOCK).unwrap(), + EventFd::new(libc::EFD_NONBLOCK).unwrap(), ))), - EventFd::new(libc::EFD_NONBLOCK).unwrap(), ) .unwrap(); diff --git a/src/vmm/src/device_manager/legacy.rs b/src/vmm/src/device_manager/legacy.rs index 20b008769a5..0af1ae3348a 100644 --- a/src/vmm/src/device_manager/legacy.rs +++ b/src/vmm/src/device_manager/legacy.rs @@ -16,15 +16,14 @@ use libc::EFD_NONBLOCK; use vm_superio::Serial; use vmm_sys_util::eventfd::EventFd; -use crate::devices::bus::BusDevice; use crate::devices::legacy::serial::SerialOut; -use crate::devices::legacy::{EventFdTrigger, SerialDevice, SerialEventsWrapper}; +use crate::devices::legacy::{EventFdTrigger, I8042Device, SerialDevice, SerialEventsWrapper}; /// Errors corresponding to the `PortIODeviceManager`. #[derive(Debug, derive_more::From, thiserror::Error, displaydoc::Display)] pub enum LegacyDeviceError { /// Failed to add legacy device to Bus: {0} - BusError(crate::devices::BusError), + BusError(vm_device::BusError), /// Failed to create EventFd: {0} EventFd(std::io::Error), } @@ -34,11 +33,11 @@ pub enum LegacyDeviceError { /// The `LegacyDeviceManger` should be initialized only by using the constructor. #[derive(Debug)] pub struct PortIODeviceManager { - pub io_bus: crate::devices::Bus, + pub io_bus: Arc, // BusDevice::Serial - pub stdio_serial: Arc>, + pub stdio_serial: Arc>, // BusDevice::I8042Device - pub i8042: Arc>, + pub i8042: Arc>, // Communication event on ports 1 & 3. pub com_evt_1_3: EventFdTrigger, @@ -73,29 +72,26 @@ impl PortIODeviceManager { /// Create a new DeviceManager handling legacy devices (uart, i8042). 
pub fn new( - serial: Arc>, - i8042_reset_evfd: EventFd, + stdio_serial: Arc>, + i8042: Arc>, ) -> Result { - debug_assert!(matches!(*serial.lock().unwrap(), BusDevice::Serial(_))); - let io_bus = crate::devices::Bus::new(); - let com_evt_1_3 = serial + let io_bus = Arc::new(vm_device::Bus::new()); + let com_evt_1_3 = stdio_serial .lock() .expect("Poisoned lock") - .serial_mut() - .unwrap() .serial .interrupt_evt() .try_clone()?; let com_evt_2_4 = EventFdTrigger::new(EventFd::new(EFD_NONBLOCK)?); - let kbd_evt = EventFd::new(libc::EFD_NONBLOCK)?; - - let i8042 = Arc::new(Mutex::new(BusDevice::I8042Device( - crate::devices::legacy::I8042Device::new(i8042_reset_evfd, kbd_evt.try_clone()?), - ))); + let kbd_evt = i8042 + .lock() + .expect("Poisoned lock") + .kbd_interrupt_evt + .try_clone()?; Ok(PortIODeviceManager { io_bus, - stdio_serial: serial, + stdio_serial, i8042, com_evt_1_3, com_evt_2_4, @@ -105,7 +101,7 @@ impl PortIODeviceManager { /// Register supported legacy devices. pub fn register_devices(&mut self, vm_fd: &VmFd) -> Result<(), LegacyDeviceError> { - let serial_2_4 = Arc::new(Mutex::new(BusDevice::Serial(SerialDevice { + let serial_2_4 = Arc::new(Mutex::new(SerialDevice { serial: Serial::with_events( self.com_evt_2_4.try_clone()?.try_clone()?, SerialEventsWrapper { @@ -114,8 +110,8 @@ impl PortIODeviceManager { SerialOut::Sink(std::io::sink()), ), input: None, - }))); - let serial_1_3 = Arc::new(Mutex::new(BusDevice::Serial(SerialDevice { + })); + let serial_1_3 = Arc::new(Mutex::new(SerialDevice { serial: Serial::with_events( self.com_evt_1_3.try_clone()?.try_clone()?, SerialEventsWrapper { @@ -124,7 +120,7 @@ impl PortIODeviceManager { SerialOut::Sink(std::io::sink()), ), input: None, - }))); + })); self.io_bus.insert( self.stdio_serial.clone(), Self::SERIAL_PORT_ADDRESSES[0], @@ -251,7 +247,7 @@ mod tests { let (_, vm) = setup_vm_with_memory(0x1000); vm.setup_irqchip().unwrap(); let mut ldm = PortIODeviceManager::new( - Arc::new(Mutex::new(BusDevice::Serial(SerialDevice { + Arc::new(Mutex::new(SerialDevice { serial: Serial::with_events( EventFdTrigger::new(EventFd::new(EFD_NONBLOCK).unwrap()), SerialEventsWrapper { @@ -260,8 +256,11 @@ mod tests { SerialOut::Sink(std::io::sink()), ), input: None, - }))), - EventFd::new(libc::EFD_NONBLOCK).unwrap(), + })), + Arc::new(Mutex::new(I8042Device::new( + EventFd::new(libc::EFD_NONBLOCK).unwrap(), + EventFd::new(libc::EFD_NONBLOCK).unwrap(), + ))), ) .unwrap(); ldm.register_devices(vm.fd()).unwrap(); diff --git a/src/vmm/src/devices/bus.rs b/src/vmm/src/devices/bus.rs deleted file mode 100644 index 6f7e1531bf3..00000000000 --- a/src/vmm/src/devices/bus.rs +++ /dev/null @@ -1,354 +0,0 @@ -// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 -// -// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the THIRD-PARTY file. - -//! Handles routing to devices in an address space. - -use std::cmp::{Ord, Ordering, PartialEq, PartialOrd}; -use std::collections::btree_map::BTreeMap; -#[cfg(test)] -use std::sync::Barrier; -use std::sync::{Arc, Mutex}; - -/// Errors triggered during bus operations. -#[derive(Debug, thiserror::Error, displaydoc::Display)] -pub enum BusError { - /// New device overlaps with an old device. 
- Overlap, -} - -#[derive(Debug, Copy, Clone)] -struct BusRange(u64, u64); - -impl Eq for BusRange {} - -impl PartialEq for BusRange { - fn eq(&self, other: &BusRange) -> bool { - self.0 == other.0 - } -} - -impl Ord for BusRange { - fn cmp(&self, other: &BusRange) -> Ordering { - self.0.cmp(&other.0) - } -} - -impl PartialOrd for BusRange { - fn partial_cmp(&self, other: &BusRange) -> Option { - Some(self.cmp(other)) - } -} - -/// A device container for routing reads and writes over some address space. -/// -/// This doesn't have any restrictions on what kind of device or address space this applies to. The -/// only restriction is that no two devices can overlap in this address space. -#[derive(Debug, Clone, Default)] -pub struct Bus { - devices: BTreeMap>>, -} - -use event_manager::{EventOps, Events, MutEventSubscriber}; - -#[cfg(target_arch = "aarch64")] -use super::legacy::RTCDevice; -use super::legacy::{I8042Device, SerialDevice}; - -#[derive(Debug)] -pub enum BusDevice { - I8042Device(I8042Device), - #[cfg(target_arch = "aarch64")] - RTCDevice(RTCDevice), - Serial(SerialDevice), - #[cfg(test)] - Constant(ConstantDevice), -} - -#[cfg(test)] -#[derive(Debug)] -pub struct DummyDevice; - -#[cfg(test)] -impl vm_device::BusDevice for DummyDevice { - fn write(&mut self, _base: u64, _offset: u64, _data: &[u8]) -> Option> { - None - } - fn read(&mut self, _base: u64, _offset: u64, _data: &mut [u8]) {} -} - -#[cfg(test)] -#[derive(Debug)] -pub struct ConstantDevice; - -#[cfg(test)] -impl ConstantDevice { - pub fn bus_read(&mut self, offset: u64, data: &mut [u8]) { - for (i, v) in data.iter_mut().enumerate() { - *v = ((offset + i as u64) & 0xff) as u8; - } - } - - fn bus_write(&mut self, offset: u64, data: &[u8]) { - for (i, v) in data.iter().enumerate() { - assert_eq!(*v, ((offset + i as u64) & 0xff) as u8) - } - } -} - -impl BusDevice { - pub fn i8042_device_ref(&self) -> Option<&I8042Device> { - match self { - Self::I8042Device(x) => Some(x), - _ => None, - } - } - #[cfg(target_arch = "aarch64")] - pub fn rtc_device_ref(&self) -> Option<&RTCDevice> { - match self { - Self::RTCDevice(x) => Some(x), - _ => None, - } - } - pub fn serial_ref(&self) -> Option<&SerialDevice> { - match self { - Self::Serial(x) => Some(x), - _ => None, - } - } - - pub fn i8042_device_mut(&mut self) -> Option<&mut I8042Device> { - match self { - Self::I8042Device(x) => Some(x), - _ => None, - } - } - #[cfg(target_arch = "aarch64")] - pub fn rtc_device_mut(&mut self) -> Option<&mut RTCDevice> { - match self { - Self::RTCDevice(x) => Some(x), - _ => None, - } - } - pub fn serial_mut(&mut self) -> Option<&mut SerialDevice> { - match self { - Self::Serial(x) => Some(x), - _ => None, - } - } - - pub fn read(&mut self, offset: u64, data: &mut [u8]) { - match self { - Self::I8042Device(x) => x.bus_read(offset, data), - #[cfg(target_arch = "aarch64")] - Self::RTCDevice(x) => x.bus_read(offset, data), - Self::Serial(x) => x.bus_read(offset, data), - #[cfg(test)] - #[cfg(test)] - Self::Constant(x) => x.bus_read(offset, data), - } - } - - pub fn write(&mut self, offset: u64, data: &[u8]) { - match self { - Self::I8042Device(x) => x.bus_write(offset, data), - #[cfg(target_arch = "aarch64")] - Self::RTCDevice(x) => x.bus_write(offset, data), - Self::Serial(x) => x.bus_write(offset, data), - #[cfg(test)] - Self::Constant(x) => x.bus_write(offset, data), - } - } -} - -impl MutEventSubscriber for BusDevice { - fn process(&mut self, event: Events, ops: &mut EventOps) { - match self { - Self::Serial(serial) => serial.process(event, 
ops), - _ => panic!(), - } - } - fn init(&mut self, ops: &mut EventOps) { - match self { - Self::Serial(serial) => serial.init(ops), - _ => panic!(), - } - } -} - -impl Bus { - /// Constructs an a bus with an empty address space. - pub fn new() -> Bus { - Bus { - devices: BTreeMap::new(), - } - } - - fn first_before(&self, addr: u64) -> Option<(BusRange, &Mutex)> { - // for when we switch to rustc 1.17: self.devices.range(..addr).iter().rev().next() - for (range, dev) in self.devices.iter().rev() { - if range.0 <= addr { - return Some((*range, dev)); - } - } - None - } - - /// Returns the device found at some address. - pub fn get_device(&self, addr: u64) -> Option<(u64, &Mutex)> { - if let Some((BusRange(start, len), dev)) = self.first_before(addr) { - let offset = addr - start; - if offset < len { - return Some((offset, dev)); - } - } - None - } - - /// Puts the given device at the given address space. - pub fn insert( - &mut self, - device: Arc>, - base: u64, - len: u64, - ) -> Result<(), BusError> { - if len == 0 { - return Err(BusError::Overlap); - } - - // Reject all cases where the new device's base is within an old device's range. - if self.get_device(base).is_some() { - return Err(BusError::Overlap); - } - - // The above check will miss an overlap in which the new device's base address is before the - // range of another device. To catch that case, we search for a device with a range before - // the new device's range's end. If there is no existing device in that range that starts - // after the new device, then there will be no overlap. - if let Some((BusRange(start, _), _)) = self.first_before(base + len - 1) { - // Such a device only conflicts with the new device if it also starts after the new - // device because of our initial `get_device` check above. - if start >= base { - return Err(BusError::Overlap); - } - } - - if self.devices.insert(BusRange(base, len), device).is_some() { - return Err(BusError::Overlap); - } - - Ok(()) - } - - /// Reads data from the device that owns the range containing `addr` and puts it into `data`. - /// - /// Returns true on success, otherwise `data` is untouched. - pub fn read(&self, addr: u64, data: &mut [u8]) -> bool { - if let Some((offset, dev)) = self.get_device(addr) { - // OK to unwrap as lock() failing is a serious error condition and should panic. - dev.lock() - .expect("Failed to acquire device lock") - .read(offset, data); - true - } else { - false - } - } - - /// Writes `data` to the device that owns the range containing `addr`. - /// - /// Returns true on success, otherwise `data` is untouched. - pub fn write(&self, addr: u64, data: &[u8]) -> bool { - if let Some((offset, dev)) = self.get_device(addr) { - // OK to unwrap as lock() failing is a serious error condition and should panic. - dev.lock() - .expect("Failed to acquire device lock") - .write(offset, data); - true - } else { - false - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn bus_insert() { - let mut bus = Bus::new(); - let dummy = Arc::new(Mutex::new(BusDevice::Constant(ConstantDevice))); - // Insert len should not be 0. - bus.insert(dummy.clone(), 0x10, 0).unwrap_err(); - bus.insert(dummy.clone(), 0x10, 0x10).unwrap(); - - let result = bus.insert(dummy.clone(), 0x0f, 0x10); - // This overlaps the address space of the existing bus device at 0x10. - assert!(matches!(result, Err(BusError::Overlap)), "{:?}", result); - - // This overlaps the address space of the existing bus device at 0x10. 
- bus.insert(dummy.clone(), 0x10, 0x10).unwrap_err(); - // This overlaps the address space of the existing bus device at 0x10. - bus.insert(dummy.clone(), 0x10, 0x15).unwrap_err(); - // This overlaps the address space of the existing bus device at 0x10. - bus.insert(dummy.clone(), 0x12, 0x15).unwrap_err(); - // This overlaps the address space of the existing bus device at 0x10. - bus.insert(dummy.clone(), 0x12, 0x01).unwrap_err(); - // This overlaps the address space of the existing bus device at 0x10. - bus.insert(dummy.clone(), 0x0, 0x20).unwrap_err(); - bus.insert(dummy.clone(), 0x20, 0x05).unwrap(); - bus.insert(dummy.clone(), 0x25, 0x05).unwrap(); - bus.insert(dummy, 0x0, 0x10).unwrap(); - } - - #[test] - fn bus_read_write_values() { - let mut bus = Bus::new(); - let dummy = Arc::new(Mutex::new(BusDevice::Constant(ConstantDevice))); - bus.insert(dummy, 0x10, 0x10).unwrap(); - - let mut values = [0, 1, 2, 3]; - assert!(bus.read(0x10, &mut values)); - assert_eq!(values, [0, 1, 2, 3]); - assert!(bus.write(0x10, &values)); - assert!(bus.read(0x15, &mut values)); - assert_eq!(values, [5, 6, 7, 8]); - assert!(bus.write(0x15, &values)); - } - - #[test] - fn busrange_cmp_and_clone() { - assert_eq!(BusRange(0x10, 2), BusRange(0x10, 3)); - assert_eq!(BusRange(0x10, 2), BusRange(0x10, 2)); - - assert!(BusRange(0x10, 2) < BusRange(0x12, 1)); - assert!(BusRange(0x10, 2) < BusRange(0x12, 3)); - - let mut bus = Bus::new(); - let mut data = [1, 2, 3, 4]; - bus.insert( - Arc::new(Mutex::new(BusDevice::Constant(ConstantDevice))), - 0x10, - 0x10, - ) - .unwrap(); - assert!(bus.write(0x10, &data)); - let bus_clone = bus.clone(); - assert!(bus.read(0x10, &mut data)); - assert_eq!(data, [1, 2, 3, 4]); - assert!(bus_clone.read(0x10, &mut data)); - assert_eq!(data, [1, 2, 3, 4]); - } - - #[test] - fn test_display_error() { - assert_eq!( - format!("{}", BusError::Overlap), - "New device overlaps with an old device." - ); - } -} diff --git a/src/vmm/src/devices/legacy/i8042.rs b/src/vmm/src/devices/legacy/i8042.rs index bcf7bdd8c90..1bc830bd13b 100644 --- a/src/vmm/src/devices/legacy/i8042.rs +++ b/src/vmm/src/devices/legacy/i8042.rs @@ -7,6 +7,7 @@ use std::io; use std::num::Wrapping; +use std::sync::{Arc, Barrier}; use log::warn; use serde::Serialize; @@ -96,7 +97,7 @@ pub struct I8042Device { reset_evt: EventFd, /// Keyboard interrupt event (IRQ 1). - kbd_interrupt_evt: EventFd, + pub kbd_interrupt_evt: EventFd, /// The i8042 status register. status: u8, @@ -209,8 +210,8 @@ impl I8042Device { } } -impl I8042Device { - pub fn bus_read(&mut self, offset: u64, data: &mut [u8]) { +impl vm_device::BusDevice for I8042Device { + fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { // All our ports are byte-wide. We don't know how to handle any wider data. if data.len() != 1 { METRICS.missed_read_count.inc(); @@ -245,11 +246,11 @@ impl I8042Device { } } - pub fn bus_write(&mut self, offset: u64, data: &[u8]) { + fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { // All our ports are byte-wide. We don't know how to handle any wider data. if data.len() != 1 { METRICS.missed_write_count.inc(); - return; + return None; } let mut write_ok = true; @@ -335,11 +336,15 @@ impl I8042Device { } else { METRICS.missed_write_count.inc(); } + + None } } #[cfg(test)] mod tests { + use vm_device::BusDevice; + use super::*; impl PartialEq for I8042Error { @@ -358,9 +363,9 @@ mod tests { // Check if reading in a 2-length array doesn't have side effects. 
let mut data = [1, 2]; - i8042.bus_read(0, &mut data); + i8042.read(0x0, 0, &mut data); assert_eq!(data, [1, 2]); - i8042.bus_read(1, &mut data); + i8042.read(0x0, 1, &mut data); assert_eq!(data, [1, 2]); // Check if reset works. @@ -368,23 +373,23 @@ mod tests { // counter doesn't change (for 0 it blocks). reset_evt.write(1).unwrap(); let mut data = [CMD_RESET_CPU]; - i8042.bus_write(OFS_STATUS, &data); + i8042.write(0x0, OFS_STATUS, &data); assert_eq!(reset_evt.read().unwrap(), 2); // Check if reading with offset 1 doesn't have side effects. - i8042.bus_read(1, &mut data); + i8042.read(0x0, 1, &mut data); assert_eq!(data[0], CMD_RESET_CPU); // Check invalid `write`s. let before = METRICS.missed_write_count.count(); // offset != 0. - i8042.bus_write(1, &data); + i8042.write(0x0, 1, &data); // data != CMD_RESET_CPU data[0] = CMD_RESET_CPU + 1; - i8042.bus_write(1, &data); + i8042.write(0x0, 1, &data); // data.len() != 1 let data = [CMD_RESET_CPU; 2]; - i8042.bus_write(1, &data); + i8042.write(0x0, 1, &data); assert_eq!(METRICS.missed_write_count.count(), before + 3); } @@ -398,33 +403,33 @@ mod tests { // Test reading/writing the control register. data[0] = CMD_WRITE_CTR; - i8042.bus_write(OFS_STATUS, &data); + i8042.write(0x0, OFS_STATUS, &data); assert_ne!(i8042.status & SB_I8042_CMD_DATA, 0); data[0] = 0x52; - i8042.bus_write(OFS_DATA, &data); + i8042.write(0x0, OFS_DATA, &data); data[0] = CMD_READ_CTR; - i8042.bus_write(OFS_STATUS, &data); + i8042.write(0x0, OFS_STATUS, &data); assert_ne!(i8042.status & SB_OUT_DATA_AVAIL, 0); - i8042.bus_read(OFS_DATA, &mut data); + i8042.read(0x0, OFS_DATA, &mut data); assert_eq!(data[0], 0x52); // Test reading/writing the output port. data[0] = CMD_WRITE_OUTP; - i8042.bus_write(OFS_STATUS, &data); + i8042.write(0x0, OFS_STATUS, &data); assert_ne!(i8042.status & SB_I8042_CMD_DATA, 0); data[0] = 0x52; - i8042.bus_write(OFS_DATA, &data); + i8042.write(0x0, OFS_DATA, &data); data[0] = CMD_READ_OUTP; - i8042.bus_write(OFS_STATUS, &data); + i8042.write(0x0, OFS_STATUS, &data); assert_ne!(i8042.status & SB_OUT_DATA_AVAIL, 0); - i8042.bus_read(OFS_DATA, &mut data); + i8042.read(0x0, OFS_DATA, &mut data); assert_eq!(data[0], 0x52); // Test kbd commands. data[0] = 0x52; - i8042.bus_write(OFS_DATA, &data); + i8042.write(0x0, OFS_DATA, &data); assert_ne!(i8042.status & SB_OUT_DATA_AVAIL, 0); - i8042.bus_read(OFS_DATA, &mut data); + i8042.read(0x0, OFS_DATA, &mut data); assert_eq!(data[0], 0xFA); } @@ -470,13 +475,13 @@ mod tests { assert!(i8042.kbd_interrupt_evt.read().unwrap() > 1); // The "data available" flag should be on. - i8042.bus_read(OFS_STATUS, &mut data); + i8042.read(0x0, OFS_STATUS, &mut data); let mut key_byte: u8; if key & 0xFF00 != 0 { // For extended keys, we should be able to read the MSB first. key_byte = ((key & 0xFF00) >> 8) as u8; - i8042.bus_read(OFS_DATA, &mut data); + i8042.read(0x0, OFS_DATA, &mut data); assert_eq!(data[0], key_byte); // And then do the same for the LSB. @@ -485,10 +490,10 @@ mod tests { i8042.trigger_kbd_interrupt().unwrap(); assert!(i8042.kbd_interrupt_evt.read().unwrap() > 1); // The "data available" flag should be on. - i8042.bus_read(OFS_STATUS, &mut data); + i8042.read(0x0, OFS_STATUS, &mut data); } key_byte = (key & 0xFF) as u8; - i8042.bus_read(OFS_DATA, &mut data); + i8042.read(0x0, OFS_DATA, &mut data); assert_eq!(data[0], key_byte); } @@ -530,9 +535,9 @@ mod tests { // Test kbd interrupt disable. 
let mut data = [1]; data[0] = CMD_WRITE_CTR; - i8042.bus_write(OFS_STATUS, &data); + i8042.write(0x0, OFS_STATUS, &data); data[0] = i8042.control & !CB_KBD_INT; - i8042.bus_write(OFS_DATA, &data); + i8042.write(0x0, OFS_DATA, &data); i8042.trigger_key(KEY_CTRL).unwrap(); assert_eq!( i8042.trigger_kbd_interrupt().unwrap_err(), diff --git a/src/vmm/src/devices/legacy/serial.rs b/src/vmm/src/devices/legacy/serial.rs index b895635e56b..afc47189c1e 100644 --- a/src/vmm/src/devices/legacy/serial.rs +++ b/src/vmm/src/devices/legacy/serial.rs @@ -9,6 +9,7 @@ use std::fmt::Debug; use std::io::{self, Read, Stdin, Write}; use std::os::unix::io::{AsRawFd, RawFd}; +use std::sync::{Arc, Barrier}; use event_manager::{EventOps, Events, MutEventSubscriber}; use libc::EFD_NONBLOCK; @@ -358,10 +359,11 @@ fn is_fifo(fd: RawFd) -> bool { (stat.st_mode & libc::S_IFIFO) != 0 } -impl - SerialWrapper +impl vm_device::BusDevice for SerialWrapper +where + I: Read + AsRawFd + Send, { - pub fn bus_read(&mut self, offset: u64, data: &mut [u8]) { + fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { if let (Ok(offset), 1) = (u8::try_from(offset), data.len()) { data[0] = self.serial.read(offset); } else { @@ -369,7 +371,7 @@ impl } } - pub fn bus_write(&mut self, offset: u64, data: &[u8]) { + fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { if let (Ok(offset), 1) = (u8::try_from(offset), data.len()) { if let Err(err) = self.serial.write(offset, data[0]) { // Counter incremented for any handle_write() error. @@ -379,24 +381,6 @@ impl } else { METRICS.missed_write_count.inc(); } - } -} - -#[cfg(target_arch = "aarch64")] -impl vm_device::BusDevice - for SerialWrapper -{ - fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { - self.bus_read(offset, data) - } - - fn write( - &mut self, - _base: u64, - offset: u64, - data: &[u8], - ) -> Option> { - self.bus_write(offset, data); None } } @@ -405,6 +389,7 @@ impl vm_device::BusDevice mod tests { #![allow(clippy::undocumented_unsafe_blocks)] + use vm_device::BusDevice; use vmm_sys_util::eventfd::EventFd; use super::*; @@ -430,13 +415,13 @@ mod tests { let invalid_reads_before = metrics.missed_read_count.count(); let mut v = [0x00; 2]; - serial.bus_read(0u64, &mut v); + serial.read(0x0, 0u64, &mut v); let invalid_reads_after = metrics.missed_read_count.count(); assert_eq!(invalid_reads_before + 1, invalid_reads_after); let mut v = [0x00; 1]; - serial.bus_read(0u64, &mut v); + serial.read(0x0, 0u64, &mut v); assert_eq!(v[0], b'a'); let invalid_reads_after_2 = metrics.missed_read_count.count(); diff --git a/src/vmm/src/devices/mod.rs b/src/vmm/src/devices/mod.rs index 495e1507edd..dd58acc9337 100644 --- a/src/vmm/src/devices/mod.rs +++ b/src/vmm/src/devices/mod.rs @@ -10,12 +10,10 @@ use std::io; pub mod acpi; -pub mod bus; pub mod legacy; pub mod pseudo; pub mod virtio; -pub use bus::{Bus, BusDevice, BusError}; use log::error; use crate::devices::virtio::net::metrics::NetDeviceMetrics; diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 964d6ab67cc..1230b337d35 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -467,12 +467,11 @@ impl Vmm { #[cfg(target_arch = "x86_64")] { - let mut guard = self + let mut serial = self .pio_device_manager .stdio_serial .lock() .expect("Poisoned lock"); - let serial = guard.serial_mut().unwrap(); serial .serial @@ -489,8 +488,6 @@ impl Vmm { .i8042 .lock() .expect("i8042 lock was poisoned") - .i8042_device_mut() - .unwrap() .trigger_ctrl_alt_del() .map_err(VmmError::I8042Error) } diff --git 
a/src/vmm/src/vstate/vcpu.rs b/src/vmm/src/vstate/vcpu.rs index c578d98fdb3..642b2fd2352 100644 --- a/src/vmm/src/vstate/vcpu.rs +++ b/src/vmm/src/vstate/vcpu.rs @@ -718,12 +718,12 @@ pub(crate) mod tests { use std::sync::{Arc, Barrier, Mutex}; use linux_loader::loader::KernelLoader; + use vm_device::BusDevice; use vmm_sys_util::errno; use super::*; use crate::RECV_TIMEOUT_SEC; use crate::arch::{BootProtocol, EntryPoint}; - use crate::devices::bus::DummyDevice; use crate::seccomp::get_empty_filters; use crate::utils::mib_to_bytes; use crate::utils::signal::validate_signal_num; @@ -733,6 +733,16 @@ pub(crate) mod tests { use crate::vstate::vm::Vm; use crate::vstate::vm::tests::setup_vm_with_memory; + struct DummyDevice; + + impl BusDevice for DummyDevice { + fn read(&mut self, _base: u64, _offset: u64, _data: &mut [u8]) {} + + fn write(&mut self, _base: u64, _offset: u64, _data: &[u8]) -> Option> { + None + } + } + #[test] fn test_handle_kvm_exit() { let (_, _, mut vcpu) = setup_vcpu(0x1000); diff --git a/src/vmm/tests/devices.rs b/src/vmm/tests/devices.rs index 62dd4d30aa7..a1ddf124cf7 100644 --- a/src/vmm/tests/devices.rs +++ b/src/vmm/tests/devices.rs @@ -12,6 +12,7 @@ use std::sync::{Arc, Mutex}; use event_manager::{EventManager, SubscriberOps}; use libc::EFD_NONBLOCK; +use vm_device::BusDevice; use vm_superio::Serial; use vmm::devices::legacy::serial::SerialOut; use vmm::devices::legacy::{EventFdTrigger, SerialEventsWrapper, SerialWrapper}; @@ -95,7 +96,7 @@ fn test_issue_serial_hangup_anon_pipe_while_registered_stdin() { serial .lock() .unwrap() - .bus_read(data_bus_offset, &mut data[i..=i]); + .read(0x0, data_bus_offset, &mut data[i..=i]); } assert!(data[..31] == dummy_data[..31]); @@ -142,7 +143,7 @@ fn test_issue_serial_hangup_anon_pipe_while_registered_stdin() { serial .lock() .unwrap() - .bus_read(data_bus_offset, &mut data[i..=i]); + .read(0x0, data_bus_offset, &mut data[i..=i]); } // Process the kick stdin event generated by the reading of the 64th byte of the serial FIFO. @@ -156,7 +157,7 @@ fn test_issue_serial_hangup_anon_pipe_while_registered_stdin() { serial .lock() .unwrap() - .bus_read(data_bus_offset, &mut data[i..=i]); + .read(0x0, data_bus_offset, &mut data[i..=i]); } // We try to read again, but we detect that stdin received previously EOF. @@ -243,7 +244,7 @@ fn test_issue_serial_hangup_anon_pipe_while_unregistered_stdin() { serial .lock() .unwrap() - .bus_read(data_bus_offset, &mut data[i..=i]); + .read(0x0, data_bus_offset, &mut data[i..=i]); } assert!(data[..31] == dummy_data[..31]); @@ -293,7 +294,7 @@ fn test_issue_serial_hangup_anon_pipe_while_unregistered_stdin() { serial .lock() .unwrap() - .bus_read(data_bus_offset, &mut data[i..=i]); + .read(0x0, data_bus_offset, &mut data[i..=i]); } // Process the kick stdin event generated by the reading of the 64th byte of the serial FIFO. @@ -309,7 +310,7 @@ fn test_issue_serial_hangup_anon_pipe_while_unregistered_stdin() { serial .lock() .unwrap() - .bus_read(data_bus_offset, &mut data[i..=i]); + .read(0x0, data_bus_offset, &mut data[i..=i]); } // We try to read again, but we detect that stdin received previously EOF. diff --git a/src/vmm/tests/integration_tests.rs b/src/vmm/tests/integration_tests.rs index 7ef68468709..2c25f3f17c3 100644 --- a/src/vmm/tests/integration_tests.rs +++ b/src/vmm/tests/integration_tests.rs @@ -212,7 +212,7 @@ fn verify_create_snapshot(is_diff: bool) -> (TempFile, TempFile) { // Check that we can deserialize the microVM state from `snapshot_file`. 
let snapshot_path = snapshot_file.as_path().to_path_buf(); let snapshot_file_metadata = std::fs::metadata(snapshot_path).unwrap(); - let snapshot_len = snapshot_file_metadata.len() as usize; + let snapshot_len = snapshot_file_metadata.len().try_into().unwrap(); let (restored_microvm_state, _) = Snapshot::load::<_, MicrovmState>(&mut snapshot_file.as_file(), snapshot_len).unwrap(); From 6755ed8ac56479166c6c4346f41445d8a823dce8 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 25 Apr 2025 16:52:48 +0200 Subject: [PATCH 11/99] refactor: add top-level device manager PCIe spec mandates that software can access the configuration space of PCIe devices both via MMIO and Port IO accesses. As a result, PCIe devices will need to register to both buses (on x86). Change the organization of devices, so that MMIO and PIO device managers do not own the buses. Instead, introduce a DeviceManager object which holds the buses, the resource allocator and includes also all types of device managers (at the moment MMIO, PIO and ACPI). Signed-off-by: Babis Chalios --- src/vmm/src/acpi/mod.rs | 109 +++-- src/vmm/src/arch/aarch64/fdt.rs | 106 ++-- src/vmm/src/arch/aarch64/mod.rs | 5 +- src/vmm/src/arch/x86_64/mod.rs | 10 +- src/vmm/src/builder.rs | 355 ++++---------- src/vmm/src/device_manager/legacy.rs | 22 +- src/vmm/src/device_manager/mmio.rs | 140 +++--- src/vmm/src/device_manager/mod.rs | 451 ++++++++++++++++++ src/vmm/src/device_manager/persist.rs | 54 ++- src/vmm/src/lib.rs | 101 +--- src/vmm/src/persist.rs | 29 +- src/vmm/tests/integration_tests.rs | 26 +- .../integration_tests/functional/test_api.py | 4 +- 13 files changed, 857 insertions(+), 555 deletions(-) diff --git a/src/vmm/src/acpi/mod.rs b/src/vmm/src/acpi/mod.rs index 0b5c5edcbde..542e53409b7 100644 --- a/src/vmm/src/acpi/mod.rs +++ b/src/vmm/src/acpi/mod.rs @@ -10,8 +10,7 @@ use crate::Vcpu; use crate::acpi::x86_64::{ apic_addr, rsdp_addr, setup_arch_dsdt, setup_arch_fadt, setup_interrupt_controllers, }; -use crate::device_manager::acpi::ACPIDeviceManager; -use crate::device_manager::mmio::MMIODeviceManager; +use crate::device_manager::DeviceManager; use crate::device_manager::resources::ResourceAllocator; use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; @@ -45,7 +44,6 @@ pub enum AcpiError { /// allocator for allocating space for the tables struct AcpiTableWriter<'a> { mem: &'a GuestMemoryMmap, - resource_allocator: &'a mut ResourceAllocator, } impl AcpiTableWriter<'_> { @@ -53,11 +51,15 @@ impl AcpiTableWriter<'_> { /// /// This will allocate enough space inside guest memory and write the table in the allocated /// buffer. It returns the address in which it wrote the table. 
- fn write_acpi_table(&mut self, table: &mut S) -> Result + fn write_acpi_table( + &mut self, + resource_allocator: &mut ResourceAllocator, + table: &mut S, + ) -> Result where S: Sdt, { - let addr = self.resource_allocator.allocate_system_memory( + let addr = resource_allocator.allocate_system_memory( table.len().try_into().unwrap(), 1, AllocPolicy::FirstMatch, @@ -77,30 +79,32 @@ impl AcpiTableWriter<'_> { } /// Build the DSDT table for the guest - fn build_dsdt( - &mut self, - mmio_device_manager: &MMIODeviceManager, - acpi_device_manager: &ACPIDeviceManager, - ) -> Result { + fn build_dsdt(&mut self, device_manager: &mut DeviceManager) -> Result { let mut dsdt_data = Vec::new(); // Virtio-devices DSDT data - dsdt_data.extend_from_slice(&mmio_device_manager.dsdt_data); + dsdt_data.extend_from_slice(&device_manager.mmio_devices.dsdt_data); // Add GED and VMGenID AML data. - acpi_device_manager.append_aml_bytes(&mut dsdt_data)?; + device_manager + .acpi_devices + .append_aml_bytes(&mut dsdt_data)?; // Architecture specific DSDT data setup_arch_dsdt(&mut dsdt_data)?; let mut dsdt = Dsdt::new(OEM_ID, *b"FCVMDSDT", OEM_REVISION, dsdt_data); - self.write_acpi_table(&mut dsdt) + self.write_acpi_table(&mut device_manager.resource_allocator, &mut dsdt) } /// Build the FADT table for the guest /// /// This includes a pointer with the location of the DSDT in guest memory - fn build_fadt(&mut self, dsdt_addr: u64) -> Result { + fn build_fadt( + &mut self, + resource_allocator: &mut ResourceAllocator, + dsdt_addr: u64, + ) -> Result { let mut fadt = Fadt::new(OEM_ID, *b"FCVMFADT", OEM_REVISION); fadt.set_hypervisor_vendor_id(HYPERVISOR_VENDOR_ID); fadt.set_x_dsdt(dsdt_addr); @@ -108,13 +112,17 @@ impl AcpiTableWriter<'_> { (1 << FADT_F_HW_REDUCED_ACPI) | (1 << FADT_F_PWR_BUTTON) | (1 << FADT_F_SLP_BUTTON), ); setup_arch_fadt(&mut fadt); - self.write_acpi_table(&mut fadt) + self.write_acpi_table(resource_allocator, &mut fadt) } /// Build the MADT table for the guest /// /// This includes information about the interrupt controllers supported in the platform - fn build_madt(&mut self, nr_vcpus: u8) -> Result { + fn build_madt( + &mut self, + resource_allocator: &mut ResourceAllocator, + nr_vcpus: u8, + ) -> Result { let mut madt = Madt::new( OEM_ID, *b"FCVMMADT", @@ -122,20 +130,25 @@ impl AcpiTableWriter<'_> { apic_addr(), setup_interrupt_controllers(nr_vcpus), ); - self.write_acpi_table(&mut madt) + self.write_acpi_table(resource_allocator, &mut madt) } /// Build the XSDT table for the guest /// /// Currently, we pass to the guest just FADT and MADT tables. - fn build_xsdt(&mut self, fadt_addr: u64, madt_addr: u64) -> Result { + fn build_xsdt( + &mut self, + resource_allocator: &mut ResourceAllocator, + fadt_addr: u64, + madt_addr: u64, + ) -> Result { let mut xsdt = Xsdt::new( OEM_ID, *b"FCMVXSDT", OEM_REVISION, vec![fadt_addr, madt_addr], ); - self.write_acpi_table(&mut xsdt) + self.write_acpi_table(resource_allocator, &mut xsdt) } /// Build the RSDP pointer for the guest. @@ -163,20 +176,19 @@ impl AcpiTableWriter<'_> { /// such as interrupt controllers, vCPUs and VirtIO devices. 
pub(crate) fn create_acpi_tables( mem: &GuestMemoryMmap, - resource_allocator: &mut ResourceAllocator, - mmio_device_manager: &MMIODeviceManager, - acpi_device_manager: &ACPIDeviceManager, + device_manager: &mut DeviceManager, vcpus: &[Vcpu], ) -> Result<(), AcpiError> { - let mut writer = AcpiTableWriter { - mem, - resource_allocator, - }; - - let dsdt_addr = writer.build_dsdt(mmio_device_manager, acpi_device_manager)?; - let fadt_addr = writer.build_fadt(dsdt_addr)?; - let madt_addr = writer.build_madt(vcpus.len().try_into().unwrap())?; - let xsdt_addr = writer.build_xsdt(fadt_addr, madt_addr)?; + let mut writer = AcpiTableWriter { mem }; + + let dsdt_addr = writer.build_dsdt(device_manager)?; + let fadt_addr = writer.build_fadt(&mut device_manager.resource_allocator, dsdt_addr)?; + let madt_addr = writer.build_madt( + &mut device_manager.resource_allocator, + vcpus.len().try_into().unwrap(), + )?; + let xsdt_addr = + writer.build_xsdt(&mut device_manager.resource_allocator, fadt_addr, madt_addr)?; writer.build_rsdp(xsdt_addr) } @@ -218,17 +230,20 @@ mod tests { let mut vmm = default_vmm(); let mut writer = AcpiTableWriter { mem: vmm.vm.guest_memory(), - resource_allocator: &mut vmm.resource_allocator, }; // This should succeed let mut sdt = MockSdt(vec![0; 4096]); - let addr = writer.write_acpi_table(&mut sdt).unwrap(); + let addr = writer + .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .unwrap(); assert_eq!(addr, SYSTEM_MEM_START); // Let's try to write two 4K pages plus one byte let mut sdt = MockSdt(vec![0; usize::try_from(SYSTEM_MEM_SIZE + 1).unwrap()]); - let err = writer.write_acpi_table(&mut sdt).unwrap_err(); + let err = writer + .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .unwrap_err(); assert!( matches!( err, @@ -241,19 +256,29 @@ mod tests { // We are allocating memory for tables with alignment of 1 byte. All of these should // succeed. 
let mut sdt = MockSdt(vec![0; 5]); - let addr = writer.write_acpi_table(&mut sdt).unwrap(); + let addr = writer + .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4096); let mut sdt = MockSdt(vec![0; 2]); - let addr = writer.write_acpi_table(&mut sdt).unwrap(); + let addr = writer + .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4101); let mut sdt = MockSdt(vec![0; 4]); - let addr = writer.write_acpi_table(&mut sdt).unwrap(); + let addr = writer + .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4103); let mut sdt = MockSdt(vec![0; 8]); - let addr = writer.write_acpi_table(&mut sdt).unwrap(); + let addr = writer + .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4107); let mut sdt = MockSdt(vec![0; 16]); - let addr = writer.write_acpi_table(&mut sdt).unwrap(); + let addr = writer + .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4115); } @@ -268,11 +293,13 @@ mod tests { let (_, vm) = setup_vm_with_memory(u64_to_usize(SYSTEM_MEM_START + SYSTEM_MEM_SIZE - 4096)); let mut writer = AcpiTableWriter { mem: vm.guest_memory(), - resource_allocator: &mut ResourceAllocator::new().unwrap(), }; + let mut resource_allocator = ResourceAllocator::new().unwrap(); let mut sdt = MockSdt(vec![0; usize::try_from(SYSTEM_MEM_SIZE).unwrap()]); - let err = writer.write_acpi_table(&mut sdt).unwrap_err(); + let err = writer + .write_acpi_table(&mut resource_allocator, &mut sdt) + .unwrap_err(); assert!( matches!( err, diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index 359f47c7044..131be4b2e31 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -13,6 +13,7 @@ use vm_memory::GuestMemoryError; use super::cache_info::{CacheEntry, read_cache_config}; use super::gic::GICDevice; +use crate::device_manager::DeviceManager; use crate::device_manager::mmio::MMIODeviceInfo; use crate::devices::acpi::vmgenid::{VMGENID_MEM_SIZE, VmGenId}; use crate::initrd::InitrdConfig; @@ -59,11 +60,8 @@ pub fn create_fdt( guest_mem: &GuestMemoryMmap, vcpu_mpidr: Vec, cmdline: CString, - virtio_devices: Vec<&MMIODeviceInfo>, - rtc: Option<&MMIODeviceInfo>, - serial: Option<&MMIODeviceInfo>, + device_manager: &DeviceManager, gic_device: &GICDevice, - vmgenid: &Option, initrd: &Option, ) -> Result, FdtError> { // Allocate stuff necessary for storing the blob. @@ -90,8 +88,8 @@ pub fn create_fdt( create_timer_node(&mut fdt_writer)?; create_clock_node(&mut fdt_writer)?; create_psci_node(&mut fdt_writer)?; - create_devices_node(&mut fdt_writer, virtio_devices, rtc, serial)?; - create_vmgenid_node(&mut fdt_writer, vmgenid)?; + create_devices_node(&mut fdt_writer, device_manager)?; + create_vmgenid_node(&mut fdt_writer, &device_manager.acpi_devices.vmgenid)?; // End Header node. 
fdt_writer.end_node(root)?; @@ -412,21 +410,21 @@ fn create_rtc_node(fdt: &mut FdtWriter, dev_info: &MMIODeviceInfo) -> Result<(), fn create_devices_node( fdt: &mut FdtWriter, - mut virtio_devices: Vec<&MMIODeviceInfo>, - rtc: Option<&MMIODeviceInfo>, - serial: Option<&MMIODeviceInfo>, + device_manager: &DeviceManager, ) -> Result<(), FdtError> { - if let Some(device_info) = rtc { - create_rtc_node(fdt, device_info)?; + if let Some(rtc_info) = device_manager.mmio_devices.rtc_device_info() { + create_rtc_node(fdt, rtc_info)?; } - if let Some(device_info) = serial { - create_serial_node(fdt, device_info)?; + if let Some(serial_info) = device_manager.mmio_devices.serial_device_info() { + create_serial_node(fdt, serial_info)?; } + let mut virtio_mmio = device_manager.mmio_devices.virtio_device_info(); + // Sort out virtio devices by address from low to high and insert them into fdt table. - virtio_devices.sort_by_key(|a| a.addr); - for ordered_device_info in virtio_devices.drain(..) { + virtio_mmio.sort_by_key(|a| a.addr); + for ordered_device_info in virtio_mmio.drain(..) { create_virtio_node(fdt, ordered_device_info)?; } @@ -436,18 +434,20 @@ fn create_devices_node( #[cfg(test)] mod tests { use std::ffi::CString; + use std::sync::{Arc, Mutex}; use kvm_ioctls::Kvm; + use linux_loader::cmdline as kernel_cmdline; use super::*; + use crate::EventManager; use crate::arch::aarch64::gic::create_gic; use crate::arch::aarch64::layout; - use crate::device_manager::resources::ResourceAllocator; + use crate::device_manager::mmio::tests::DummyDevice; + use crate::device_manager::tests::default_device_manager; use crate::test_utils::arch_mem; use crate::vstate::memory::GuestAddress; - const LEN: u64 = 4096; - // The `load` function from the `device_tree` will mistakenly check the actual size // of the buffer with the allocated size. This works around that. 
fn set_size(buf: &mut [u8], pos: usize, val: u32) { @@ -460,36 +460,37 @@ mod tests { #[test] fn test_create_fdt_with_devices() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); - - let serial = MMIODeviceInfo { - addr: 0x00, - irq: Some(1u32), - len: LEN, - }; - let virtio_device = MMIODeviceInfo { - addr: LEN, - irq: Some(2u32), - len: LEN, - }; - let rtc = MMIODeviceInfo { - addr: 2 * LEN, - irq: Some(3u32), - len: LEN, - }; - + let mut event_manager = EventManager::new().unwrap(); + let mut device_manager = default_device_manager(); let kvm = Kvm::new().unwrap(); let vm = kvm.create_vm().unwrap(); let gic = create_gic(&vm, 1, None).unwrap(); + let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); + cmdline.insert("console", "/dev/tty0").unwrap(); + + device_manager + .attach_legacy_devices_aarch64(&vm, &mut event_manager, &mut cmdline) + .unwrap(); + let dummy = Arc::new(Mutex::new(DummyDevice::new())); + device_manager + .mmio_devices + .register_virtio_test_device( + &vm, + mem.clone(), + &mut device_manager.resource_allocator, + dummy, + &mut cmdline, + "dummy", + ) + .unwrap(); + create_fdt( &mem, vec![0], - CString::new("console=tty0").unwrap(), - vec![&virtio_device], - Some(&rtc), - Some(&serial), + cmdline.as_cstring().unwrap(), + &device_manager, &gic, &None, - &None, ) .unwrap(); } @@ -497,20 +498,21 @@ mod tests { #[test] fn test_create_fdt_with_vmgenid() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); - let mut resource_allocator = ResourceAllocator::new().unwrap(); - let vmgenid = VmGenId::new(&mem, &mut resource_allocator).unwrap(); + let mut device_manager = default_device_manager(); let kvm = Kvm::new().unwrap(); let vm = kvm.create_vm().unwrap(); let gic = create_gic(&vm, 1, None).unwrap(); + let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); + cmdline.insert("console", "/dev/tty0").unwrap(); + + device_manager.attach_vmgenid_device(&mem, &vm).unwrap(); + create_fdt( &mem, vec![0], CString::new("console=tty0").unwrap(), - Vec::new(), - None, - None, + &device_manager, &gic, - &Some(vmgenid), &None, ) .unwrap(); @@ -519,6 +521,7 @@ mod tests { #[test] fn test_create_fdt() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); + let device_manager = default_device_manager(); let kvm = Kvm::new().unwrap(); let vm = kvm.create_vm().unwrap(); let gic = create_gic(&vm, 1, None).unwrap(); @@ -533,12 +536,9 @@ mod tests { &mem, vec![0], CString::new("console=tty0").unwrap(), - Vec::new(), - None, - None, + &device_manager, &gic, &None, - &None, ) .unwrap(); @@ -578,6 +578,7 @@ mod tests { #[test] fn test_create_fdt_with_initrd() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); + let device_manager = default_device_manager(); let kvm = Kvm::new().unwrap(); let vm = kvm.create_vm().unwrap(); let gic = create_gic(&vm, 1, None).unwrap(); @@ -597,11 +598,8 @@ mod tests { &mem, vec![0], CString::new("console=tty0").unwrap(), - vec![], - None, - None, + &device_manager, &gic, - &None, &Some(initrd), ) .unwrap(); diff --git a/src/vmm/src/arch/aarch64/mod.rs b/src/vmm/src/arch/aarch64/mod.rs index f945601c940..6d1d0e26359 100644 --- a/src/vmm/src/arch/aarch64/mod.rs +++ b/src/vmm/src/arch/aarch64/mod.rs @@ -134,11 +134,8 @@ pub fn configure_system_for_boot( vmm.vm.guest_memory(), vcpu_mpidr, cmdline, - vmm.mmio_device_manager.virtio_device_info(), - vmm.mmio_device_manager.rtc_device_info(), - vmm.mmio_device_manager.serial_device_info(), + &vmm.device_manager, vmm.vm.get_irqchip(), - &vmm.acpi_device_manager.vmgenid, initrd, )?; diff --git 
a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index ca350cbf9af..c54ec46c987 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -205,7 +205,7 @@ pub fn configure_system_for_boot( // Note that this puts the mptable at the last 1k of Linux's 640k base RAM mptable::setup_mptable( vmm.vm.guest_memory(), - &mut vmm.resource_allocator, + &mut vmm.device_manager.resource_allocator, vcpu_config.vcpu_count, ) .map_err(ConfigurationError::MpTableSetup)?; @@ -226,13 +226,7 @@ pub fn configure_system_for_boot( // Create ACPI tables and write them in guest memory // For the time being we only support ACPI in x86_64 - create_acpi_tables( - vmm.vm.guest_memory(), - &mut vmm.resource_allocator, - &vmm.mmio_device_manager, - &vmm.acpi_device_manager, - vcpus, - )?; + create_acpi_tables(vmm.vm.guest_memory(), &mut vmm.device_manager, vcpus)?; Ok(()) } diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 285c0df0058..aa801b85ed1 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -9,14 +9,12 @@ use std::io; use std::sync::mpsc; use std::sync::{Arc, Mutex}; -use event_manager::{MutEventSubscriber, SubscriberOps}; +use event_manager::SubscriberOps; use linux_loader::cmdline::Cmdline as LoaderKernelCmdline; use userfaultfd::Uffd; use utils::time::TimestampUs; #[cfg(target_arch = "aarch64")] use vm_memory::GuestAddress; -#[cfg(target_arch = "x86_64")] -use vmm_sys_util::eventfd::EventFd; use crate::arch::{ConfigurationError, configure_system_for_boot, load_kernel}; #[cfg(target_arch = "aarch64")] @@ -24,36 +22,24 @@ use crate::construct_kvm_mpidrs; use crate::cpu_config::templates::{ GetCpuTemplate, GetCpuTemplateError, GuestConfigError, KvmCapability, }; -use crate::device_manager::acpi::ACPIDeviceManager; -#[cfg(target_arch = "x86_64")] -use crate::device_manager::legacy::PortIODeviceManager; -use crate::device_manager::mmio::{MMIODeviceManager, MmioError}; -use crate::device_manager::persist::{ - ACPIDeviceManagerConstructorArgs, ACPIDeviceManagerRestoreError, MMIODevManagerConstructorArgs, -}; -use crate::device_manager::resources::ResourceAllocator; -use crate::devices::acpi::vmgenid::{VmGenId, VmGenIdError}; -#[cfg(target_arch = "x86_64")] -use crate::devices::legacy::I8042Device; #[cfg(target_arch = "aarch64")] -use crate::devices::legacy::RTCDevice; -use crate::devices::legacy::SerialDevice; -use crate::devices::legacy::serial::SerialOut; +use crate::device_manager::AttachLegacyMmioDeviceError; +use crate::device_manager::{ + AttachMmioDeviceError, AttachVmgenidError, DeviceManager, DevicePersistError, DeviceRestoreArgs, +}; +use crate::devices::acpi::vmgenid::VmGenIdError; use crate::devices::virtio::balloon::Balloon; use crate::devices::virtio::block::device::Block; -use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::net::Net; use crate::devices::virtio::rng::Entropy; -use crate::devices::virtio::transport::mmio::{IrqTrigger, MmioTransport}; use crate::devices::virtio::vsock::{Vsock, VsockUnixBackend}; #[cfg(feature = "gdb")] use crate::gdb; use crate::initrd::{InitrdConfig, InitrdError}; -use crate::logger::{debug, error}; +use crate::logger::debug; use crate::persist::{MicrovmState, MicrovmStateError}; use crate::resources::VmResources; use crate::seccomp::BpfThreadMap; -use crate::snapshot::Persist; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::MachineConfigError; use crate::vstate::kvm::Kvm; @@ -68,7 +54,10 @@ pub enum StartMicrovmError { /// 
Unable to attach block device to Vmm: {0} AttachBlockDevice(io::Error), /// Unable to attach the VMGenID device: {0} - AttachVmgenidDevice(kvm_ioctls::Error), + AttachVmgenidDevice(#[from] AttachVmgenidError), + #[cfg(target_arch = "aarch64")] + /// Unable to attach legacy MMIO devices: {0} + AttachLegacyDevices(#[from] AttachLegacyMmioDeviceError), /// System configuration error: {0} ConfigureSystem(#[from] ConfigurationError), /// Failed to create guest config: {0} @@ -108,7 +97,7 @@ pub enum StartMicrovmError { /// Cannot open the block device backing file: {0} OpenBlockDevice(io::Error), /// Cannot initialize a MMIO Device or add a device to the MMIO Bus or cmdline: {0} - RegisterMmioDevice(#[from] device_manager::mmio::MmioError), + RegisterMmioDevice(#[from] device_manager::AttachMmioDeviceError), /// Cannot restore microvm state: {0} RestoreMicrovmState(MicrovmStateError), /// Cannot set vm resources: {0} @@ -145,39 +134,9 @@ fn create_vmm_and_vcpus( // Build custom CPU config if a custom template is provided. let mut vm = Vm::new(&kvm)?; - let resource_allocator = ResourceAllocator::new()?; - - // Instantiate the MMIO device manager. - let mmio_device_manager = MMIODeviceManager::new(); - - // Instantiate ACPI device manager. - let acpi_device_manager = ACPIDeviceManager::new(); - let (vcpus, vcpus_exit_evt) = vm.create_vcpus(vcpu_count)?; - #[cfg(target_arch = "x86_64")] - let pio_device_manager = { - // Make stdout non blocking. - set_stdout_nonblocking(); - - // Serial device setup. - let serial_device = setup_serial_device(event_manager)?; - - // x86_64 uses the i8042 reset event as the Vmm exit event. - let reset_evt = vcpus_exit_evt.try_clone().map_err(VmmError::EventFd)?; - let i8042 = Arc::new(Mutex::new(I8042Device::new( - reset_evt, - EventFd::new(libc::EFD_NONBLOCK).map_err(VmmError::EventFd)?, - ))); - - // create pio dev manager with legacy devices - let mut pio_dev_mgr = - PortIODeviceManager::new(serial_device, i8042).map_err(VmmError::LegacyIOBus)?; - pio_dev_mgr - .register_devices(vm.fd()) - .map_err(VmmError::LegacyIOBus)?; - pio_dev_mgr - }; + let device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, vm.fd())?; let vmm = Vmm { events_observer: Some(std::io::stdin()), @@ -188,11 +147,7 @@ fn create_vmm_and_vcpus( uffd: None, vcpus_handles: Vec::new(), vcpus_exit_evt, - resource_allocator, - mmio_device_manager, - #[cfg(target_arch = "x86_64")] - pio_device_manager, - acpi_device_manager, + device_manager, }; Ok((vmm, vcpus)) @@ -263,7 +218,7 @@ pub fn build_microvm_for_boot( // to maintain the same MMIO address referenced in the documentation // and tests. 
if vm_resources.boot_timer { - attach_boot_timer_device(&mut vmm, request_ts)?; + vmm.device_manager.attach_boot_timer_device(request_ts)?; } if let Some(balloon) = vm_resources.balloon.get() { @@ -292,9 +247,14 @@ pub fn build_microvm_for_boot( } #[cfg(target_arch = "aarch64")] - attach_legacy_devices_aarch64(event_manager, &mut vmm, &mut boot_cmdline)?; + vmm.device_manager.attach_legacy_devices_aarch64( + vmm.vm.fd(), + event_manager, + &mut boot_cmdline, + )?; - attach_vmgenid_device(&mut vmm)?; + vmm.device_manager + .attach_vmgenid_device(vmm.vm.guest_memory(), vmm.vm.fd())?; #[cfg(target_arch = "aarch64")] if vcpus[0].kvm_vcpu.supports_pvtime() { @@ -413,10 +373,8 @@ pub enum BuildMicrovmFromSnapshotError { MissingVmmSeccompFilters, /// Failed to apply VMM secccomp filter: {0} SeccompFiltersInternal(#[from] crate::seccomp::InstallationError), - /// Failed to restore ACPI device manager: {0} - ACPIDeviManager(#[from] ACPIDeviceManagerRestoreError), - /// VMGenID update failed: {0} - VMGenIDUpdate(std::io::Error), + /// Failed to restore devices: {0} + RestoreDevices(#[from] DevicePersistError), } /// Builds and starts a microVM based on the provided MicrovmState. @@ -496,38 +454,17 @@ pub fn build_microvm_from_snapshot( vm_resources.boot_source.config = microvm_state.vm_info.boot_source; // Restore devices states. - let mmio_ctor_args = MMIODevManagerConstructorArgs { + let device_ctor_args = DeviceRestoreArgs { mem: vmm.vm.guest_memory(), vm: vmm.vm.fd(), event_manager, - resource_allocator: &mut vmm.resource_allocator, vm_resources, instance_id: &instance_info.id, restored_from_file: vmm.uffd.is_none(), }; - vmm.mmio_device_manager = - MMIODeviceManager::restore(mmio_ctor_args, µvm_state.device_states) - .map_err(MicrovmStateError::RestoreDevices)?; - vmm.emulate_serial_init()?; - - { - let acpi_ctor_args = ACPIDeviceManagerConstructorArgs { - mem: vmm.vm.guest_memory(), - resource_allocator: &mut vmm.resource_allocator, - vm: vmm.vm.fd(), - }; - - vmm.acpi_device_manager = - ACPIDeviceManager::restore(acpi_ctor_args, µvm_state.acpi_dev_state)?; - - // Inject the notification to VMGenID that we have resumed from a snapshot. - // This needs to happen before we resume vCPUs, so that we minimize the time between vCPUs - // resuming and notification being handled by the driver. - vmm.acpi_device_manager - .notify_vmgenid() - .map_err(BuildMicrovmFromSnapshotError::VMGenIDUpdate)?; - } + vmm.device_manager + .restore(µvm_state.device_states, device_ctor_args)?; // Move vcpus to their own threads and start their state machine in the 'Paused' state. vmm.start_vcpus( @@ -553,18 +490,6 @@ pub fn build_microvm_from_snapshot( Ok(vmm) } -/// Sets up the serial device. 
-pub fn setup_serial_device( - event_manager: &mut EventManager, -) -> Result>, VmmError> { - let serial = Arc::new(Mutex::new( - SerialDevice::new(Some(std::io::stdin()), SerialOut::Stdout(std::io::stdout())) - .map_err(VmmError::EventFd)?, - )); - event_manager.add_subscriber(serial.clone()); - Ok(serial) -} - /// 64 bytes due to alignment requirement in 3.1 of https://www.kernel.org/doc/html/v5.8/virt/kvm/devices/vcpu.html#attribute-kvm-arm-vcpu-pvtime-ipa #[cfg(target_arch = "aarch64")] const STEALTIME_STRUCT_MEM_SIZE: u64 = 64; @@ -578,6 +503,7 @@ fn allocate_pvtime_region( ) -> Result { let size = STEALTIME_STRUCT_MEM_SIZE * vcpu_count as u64; let addr = vmm + .device_manager .resource_allocator .allocate_system_memory(size, STEALTIME_STRUCT_MEM_SIZE, policy) .map_err(StartMicrovmError::AllocateResources)?; @@ -603,110 +529,22 @@ fn setup_pvtime(vmm: &mut Vmm, vcpus: &mut [Vcpu]) -> Result<(), StartMicrovmErr Ok(()) } -#[cfg(target_arch = "aarch64")] -fn attach_legacy_devices_aarch64( - event_manager: &mut EventManager, - vmm: &mut Vmm, - cmdline: &mut LoaderKernelCmdline, -) -> Result<(), VmmError> { - // Serial device setup. - let cmdline_contains_console = cmdline - .as_cstring() - .map_err(|_| VmmError::Cmdline)? - .into_string() - .map_err(|_| VmmError::Cmdline)? - .contains("console="); - - if cmdline_contains_console { - // Make stdout non-blocking. - set_stdout_nonblocking(); - let serial = Arc::new(Mutex::new( - SerialDevice::new(Some(std::io::stdin()), SerialOut::Stdout(std::io::stdout())) - .map_err(VmmError::EventFd)?, - )); - event_manager.add_subscriber(serial.clone()); - vmm.mmio_device_manager - .register_mmio_serial(vmm.vm.fd(), &mut vmm.resource_allocator, serial, None) - .map_err(VmmError::RegisterMMIODevice)?; - vmm.mmio_device_manager - .add_mmio_serial_to_cmdline(cmdline) - .map_err(VmmError::RegisterMMIODevice)?; - } - - let rtc = RTCDevice::new(); - vmm.mmio_device_manager - .register_mmio_rtc(&mut vmm.resource_allocator, rtc, None) - .map_err(VmmError::RegisterMMIODevice) -} - -/// Attaches a VirtioDevice device to the device manager and event manager. -fn attach_virtio_device( - event_manager: &mut EventManager, - vmm: &mut Vmm, - id: String, - device: Arc>, - cmdline: &mut LoaderKernelCmdline, - is_vhost_user: bool, -) -> Result<(), MmioError> { - event_manager.add_subscriber(device.clone()); - - let interrupt = Arc::new(IrqTrigger::new()); - // The device mutex mustn't be locked here otherwise it will deadlock. 
- let device = MmioTransport::new( - vmm.vm.guest_memory().clone(), - interrupt, - device, - is_vhost_user, - ); - vmm.mmio_device_manager - .register_mmio_virtio_for_boot( - vmm.vm.fd(), - &mut vmm.resource_allocator, - id, - device, - cmdline, - ) - .map(|_| ()) -} - -pub(crate) fn attach_boot_timer_device( - vmm: &mut Vmm, - request_ts: TimestampUs, -) -> Result<(), MmioError> { - let boot_timer = crate::devices::pseudo::BootTimer::new(request_ts); - - vmm.mmio_device_manager - .register_mmio_boot_timer(&mut vmm.resource_allocator, boot_timer)?; - - Ok(()) -} - -fn attach_vmgenid_device(vmm: &mut Vmm) -> Result<(), StartMicrovmError> { - let vmgenid = VmGenId::new(vmm.vm.guest_memory(), &mut vmm.resource_allocator) - .map_err(StartMicrovmError::CreateVMGenID)?; - - vmm.acpi_device_manager - .attach_vmgenid(vmgenid, vmm.vm.fd()) - .map_err(StartMicrovmError::AttachVmgenidDevice)?; - - Ok(()) -} - fn attach_entropy_device( vmm: &mut Vmm, cmdline: &mut LoaderKernelCmdline, entropy_device: &Arc>, event_manager: &mut EventManager, -) -> Result<(), MmioError> { +) -> Result<(), AttachMmioDeviceError> { let id = entropy_device .lock() .expect("Poisoned lock") .id() .to_string(); - attach_virtio_device( - event_manager, - vmm, + event_manager.add_subscriber(entropy_device.clone()); + vmm.device_manager.attach_virtio_device( + vmm.vm.guest_memory(), + vmm.vm.fd(), id, entropy_device.clone(), cmdline, @@ -736,9 +574,10 @@ fn attach_block_devices<'a, I: Iterator>> + Debug>( (locked.id().to_string(), locked.is_vhost_user()) }; // The device mutex mustn't be locked here otherwise it will deadlock. - attach_virtio_device( - event_manager, - vmm, + event_manager.add_subscriber(block.clone()); + vmm.device_manager.attach_virtio_device( + vmm.vm.guest_memory(), + vmm.vm.fd(), id, block.clone(), cmdline, @@ -756,8 +595,16 @@ fn attach_net_devices<'a, I: Iterator>> + Debug>( ) -> Result<(), StartMicrovmError> { for net_device in net_devices { let id = net_device.lock().expect("Poisoned lock").id().clone(); + event_manager.add_subscriber(net_device.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. - attach_virtio_device(event_manager, vmm, id, net_device.clone(), cmdline, false)?; + vmm.device_manager.attach_virtio_device( + vmm.vm.guest_memory(), + vmm.vm.fd(), + id, + net_device.clone(), + cmdline, + false, + )?; } Ok(()) } @@ -767,10 +614,18 @@ fn attach_unixsock_vsock_device( cmdline: &mut LoaderKernelCmdline, unix_vsock: &Arc>>, event_manager: &mut EventManager, -) -> Result<(), MmioError> { +) -> Result<(), AttachMmioDeviceError> { let id = String::from(unix_vsock.lock().expect("Poisoned lock").id()); + event_manager.add_subscriber(unix_vsock.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. - attach_virtio_device(event_manager, vmm, id, unix_vsock.clone(), cmdline, false) + vmm.device_manager.attach_virtio_device( + vmm.vm.guest_memory(), + vmm.vm.fd(), + id, + unix_vsock.clone(), + cmdline, + false, + ) } fn attach_balloon_device( @@ -778,38 +633,28 @@ fn attach_balloon_device( cmdline: &mut LoaderKernelCmdline, balloon: &Arc>, event_manager: &mut EventManager, -) -> Result<(), MmioError> { +) -> Result<(), AttachMmioDeviceError> { let id = String::from(balloon.lock().expect("Poisoned lock").id()); + event_manager.add_subscriber(balloon.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. 
- attach_virtio_device(event_manager, vmm, id, balloon.clone(), cmdline, false) -} - -// Adds `O_NONBLOCK` to the stdout flags. -pub(crate) fn set_stdout_nonblocking() { - // SAFETY: Call is safe since parameters are valid. - let flags = unsafe { libc::fcntl(libc::STDOUT_FILENO, libc::F_GETFL, 0) }; - if flags < 0 { - error!("Could not get Firecracker stdout flags."); - } - // SAFETY: Call is safe since parameters are valid. - let rc = unsafe { libc::fcntl(libc::STDOUT_FILENO, libc::F_SETFL, flags | libc::O_NONBLOCK) }; - if rc < 0 { - error!("Could not set Firecracker stdout to non-blocking."); - } + vmm.device_manager.attach_virtio_device( + vmm.vm.guest_memory(), + vmm.vm.fd(), + id, + balloon.clone(), + cmdline, + false, + ) } #[cfg(test)] pub(crate) mod tests { use linux_loader::cmdline::Cmdline; - #[cfg(target_arch = "x86_64")] - use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::tempfile::TempFile; use super::*; - use crate::device_manager::resources::ResourceAllocator; - #[cfg(target_arch = "x86_64")] - use crate::devices::legacy::serial::SerialOut; + use crate::device_manager::tests::default_device_manager; use crate::devices::virtio::block::CacheType; use crate::devices::virtio::rng::device::ENTROPY_DEV_ID; use crate::devices::virtio::vsock::{TYPE_VSOCK, VSOCK_DEV_ID}; @@ -882,20 +727,6 @@ pub(crate) mod tests { pub(crate) fn default_vmm() -> Vmm { let (kvm, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); - let mmio_device_manager = MMIODeviceManager::new(); - let acpi_device_manager = ACPIDeviceManager::new(); - #[cfg(target_arch = "x86_64")] - let pio_device_manager = PortIODeviceManager::new( - Arc::new(Mutex::new( - SerialDevice::new(None, SerialOut::Sink(std::io::sink())).unwrap(), - )), - Arc::new(Mutex::new(I8042Device::new( - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - ))), - ) - .unwrap(); - let (_, vcpus_exit_evt) = vm.create_vcpus(1).unwrap(); Vmm { @@ -907,11 +738,7 @@ pub(crate) mod tests { uffd: None, vcpus_handles: Vec::new(), vcpus_exit_evt, - resource_allocator: ResourceAllocator::new().unwrap(), - mmio_device_manager, - #[cfg(target_arch = "x86_64")] - pio_device_manager, - acpi_device_manager, + device_manager: default_device_manager(), } } @@ -1007,7 +834,8 @@ pub(crate) mod tests { attach_unixsock_vsock_device(vmm, cmdline, &vsock, event_manager).unwrap(); assert!( - vmm.mmio_device_manager + vmm.device_manager + .mmio_devices .get_virtio_device(TYPE_VSOCK, &vsock_dev_id) .is_some() ); @@ -1025,7 +853,8 @@ pub(crate) mod tests { attach_entropy_device(vmm, cmdline, &entropy, event_manager).unwrap(); assert!( - vmm.mmio_device_manager + vmm.device_manager + .mmio_devices .get_virtio_device(TYPE_RNG, ENTROPY_DEV_ID) .is_some() ); @@ -1033,8 +862,10 @@ pub(crate) mod tests { #[cfg(target_arch = "x86_64")] pub(crate) fn insert_vmgenid_device(vmm: &mut Vmm) { - attach_vmgenid_device(vmm).unwrap(); - assert!(vmm.acpi_device_manager.vmgenid.is_some()); + vmm.device_manager + .attach_vmgenid_device(vmm.vm.guest_memory(), vmm.vm.fd()) + .unwrap(); + assert!(vmm.device_manager.acpi_devices.vmgenid.is_some()); } pub(crate) fn insert_balloon_device( @@ -1050,7 +881,8 @@ pub(crate) mod tests { attach_balloon_device(vmm, cmdline, balloon, event_manager).unwrap(); assert!( - vmm.mmio_device_manager + vmm.device_manager + .mmio_devices .get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) .is_some() ); @@ -1101,7 +933,8 @@ pub(crate) mod tests { insert_block_devices(&mut vmm, &mut cmdline, &mut event_manager, block_configs); 
assert!(cmdline_contains(&cmdline, "root=/dev/vda ro")); assert!( - vmm.mmio_device_manager + vmm.device_manager + .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1122,7 +955,8 @@ pub(crate) mod tests { insert_block_devices(&mut vmm, &mut cmdline, &mut event_manager, block_configs); assert!(cmdline_contains(&cmdline, "root=PARTUUID=0eaa91a0-01 rw")); assert!( - vmm.mmio_device_manager + vmm.device_manager + .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1144,7 +978,8 @@ pub(crate) mod tests { assert!(!cmdline_contains(&cmdline, "root=PARTUUID=")); assert!(!cmdline_contains(&cmdline, "root=/dev/vda")); assert!( - vmm.mmio_device_manager + vmm.device_manager + .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1181,17 +1016,20 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=PARTUUID=0eaa91a0-01 rw")); assert!( - vmm.mmio_device_manager + vmm.device_manager + .mmio_devices .get_virtio_device(TYPE_BLOCK, "root") .is_some() ); assert!( - vmm.mmio_device_manager + vmm.device_manager + .mmio_devices .get_virtio_device(TYPE_BLOCK, "secondary") .is_some() ); assert!( - vmm.mmio_device_manager + vmm.device_manager + .mmio_devices .get_virtio_device(TYPE_BLOCK, "third") .is_some() ); @@ -1220,7 +1058,8 @@ pub(crate) mod tests { insert_block_devices(&mut vmm, &mut cmdline, &mut event_manager, block_configs); assert!(cmdline_contains(&cmdline, "root=/dev/vda rw")); assert!( - vmm.mmio_device_manager + vmm.device_manager + .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1241,7 +1080,8 @@ pub(crate) mod tests { insert_block_devices(&mut vmm, &mut cmdline, &mut event_manager, block_configs); assert!(cmdline_contains(&cmdline, "root=PARTUUID=0eaa91a0-01 ro")); assert!( - vmm.mmio_device_manager + vmm.device_manager + .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1262,7 +1102,8 @@ pub(crate) mod tests { insert_block_devices(&mut vmm, &mut cmdline, &mut event_manager, block_configs); assert!(cmdline_contains(&cmdline, "root=/dev/vda rw")); assert!( - vmm.mmio_device_manager + vmm.device_manager + .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1274,9 +1115,9 @@ pub(crate) mod tests { let mut vmm = default_vmm(); let request_ts = TimestampUs::default(); - let res = attach_boot_timer_device(&mut vmm, request_ts); + let res = vmm.device_manager.attach_boot_timer_device(request_ts); res.unwrap(); - assert!(vmm.mmio_device_manager.boot_timer.is_some()); + assert!(vmm.device_manager.mmio_devices.boot_timer.is_some()); } #[test] diff --git a/src/vmm/src/device_manager/legacy.rs b/src/vmm/src/device_manager/legacy.rs index 0af1ae3348a..a2866f14415 100644 --- a/src/vmm/src/device_manager/legacy.rs +++ b/src/vmm/src/device_manager/legacy.rs @@ -33,7 +33,6 @@ pub enum LegacyDeviceError { /// The `LegacyDeviceManger` should be initialized only by using the constructor. 
#[derive(Debug)] pub struct PortIODeviceManager { - pub io_bus: Arc, // BusDevice::Serial pub stdio_serial: Arc>, // BusDevice::I8042Device @@ -75,7 +74,6 @@ impl PortIODeviceManager { stdio_serial: Arc>, i8042: Arc>, ) -> Result { - let io_bus = Arc::new(vm_device::Bus::new()); let com_evt_1_3 = stdio_serial .lock() .expect("Poisoned lock") @@ -90,7 +88,6 @@ impl PortIODeviceManager { .try_clone()?; Ok(PortIODeviceManager { - io_bus, stdio_serial, i8042, com_evt_1_3, @@ -100,7 +97,11 @@ impl PortIODeviceManager { } /// Register supported legacy devices. - pub fn register_devices(&mut self, vm_fd: &VmFd) -> Result<(), LegacyDeviceError> { + pub fn register_devices( + &mut self, + io_bus: &vm_device::Bus, + vm_fd: &VmFd, + ) -> Result<(), LegacyDeviceError> { let serial_2_4 = Arc::new(Mutex::new(SerialDevice { serial: Serial::with_events( self.com_evt_2_4.try_clone()?.try_clone()?, @@ -121,27 +122,27 @@ impl PortIODeviceManager { ), input: None, })); - self.io_bus.insert( + io_bus.insert( self.stdio_serial.clone(), Self::SERIAL_PORT_ADDRESSES[0], Self::SERIAL_PORT_SIZE, )?; - self.io_bus.insert( + io_bus.insert( serial_2_4.clone(), Self::SERIAL_PORT_ADDRESSES[1], Self::SERIAL_PORT_SIZE, )?; - self.io_bus.insert( + io_bus.insert( serial_1_3, Self::SERIAL_PORT_ADDRESSES[2], Self::SERIAL_PORT_SIZE, )?; - self.io_bus.insert( + io_bus.insert( serial_2_4, Self::SERIAL_PORT_ADDRESSES[3], Self::SERIAL_PORT_SIZE, )?; - self.io_bus.insert( + io_bus.insert( self.i8042.clone(), Self::I8042_KDB_DATA_REGISTER_ADDRESS, Self::I8042_KDB_DATA_REGISTER_SIZE, @@ -245,6 +246,7 @@ mod tests { #[test] fn test_register_legacy_devices() { let (_, vm) = setup_vm_with_memory(0x1000); + let io_bus = vm_device::Bus::new(); vm.setup_irqchip().unwrap(); let mut ldm = PortIODeviceManager::new( Arc::new(Mutex::new(SerialDevice { @@ -263,6 +265,6 @@ mod tests { ))), ) .unwrap(); - ldm.register_devices(vm.fd()).unwrap(); + ldm.register_devices(&io_bus, vm.fd()).unwrap(); } } diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 55e5dbc402f..153f67639db 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -126,7 +126,6 @@ pub struct MMIODevice { /// Manages the complexities of registering a MMIO device. #[derive(Debug)] pub struct MMIODeviceManager { - pub(crate) bus: Arc, /// VirtIO devices using an MMIO transport layer pub(crate) virtio_devices: HashMap<(u32, String), MMIODevice>, /// Boot timer device @@ -152,7 +151,6 @@ impl MMIODeviceManager { /// Create a new DeviceManager handling mmio devices (virtio net, block). pub fn new() -> MMIODeviceManager { MMIODeviceManager { - bus: Arc::new(vm_device::Bus::new()), virtio_devices: HashMap::new(), boot_timer: None, #[cfg(target_arch = "aarch64")] @@ -193,21 +191,20 @@ impl MMIODeviceManager { &mut self, vm: &VmFd, device_id: String, - mmio_device: MmioTransport, - device_info: &MMIODeviceInfo, + mmio_bus: &vm_device::Bus, + device: MMIODevice, ) -> Result<(), MmioError> { // Our virtio devices are currently hardcoded to use a single IRQ. // Validate that requirement. 
- let Some(irq) = device_info.irq else { - return Err(MmioError::InvalidIrqConfig); - }; + let irq = device.resources.irq.ok_or(MmioError::InvalidIrqConfig)?; let identifier; { + let mmio_device = device.inner.lock().expect("Poisoned lock"); let locked_device = mmio_device.locked_device(); identifier = (locked_device.device_type(), device_id); for (i, queue_evt) in locked_device.queue_events().iter().enumerate() { let io_addr = IoEventAddress::Mmio( - device_info.addr + u64::from(crate::devices::virtio::NOTIFY_REG_OFFSET), + device.resources.addr + u64::from(crate::devices::virtio::NOTIFY_REG_OFFSET), ); vm.register_ioevent(queue_evt, &io_addr, u32::try_from(i).unwrap()) .map_err(MmioError::RegisterIoEvent)?; @@ -216,16 +213,12 @@ impl MMIODeviceManager { .map_err(MmioError::RegisterIrqFd)?; } - let device = Arc::new(Mutex::new(mmio_device)); - self.bus - .insert(device.clone(), device_info.addr, device_info.len)?; - self.virtio_devices.insert( - identifier, - MMIODevice { - resources: *device_info, - inner: device, - }, - ); + mmio_bus.insert( + device.inner.clone(), + device.resources.addr, + device.resources.len, + )?; + self.virtio_devices.insert(identifier, device); Ok(()) } @@ -258,24 +251,29 @@ impl MMIODeviceManager { vm: &VmFd, resource_allocator: &mut ResourceAllocator, device_id: String, + mmio_bus: &vm_device::Bus, mmio_device: MmioTransport, _cmdline: &mut kernel_cmdline::Cmdline, - ) -> Result { - let device_info = self.allocate_mmio_resources(resource_allocator, 1)?; - self.register_mmio_virtio(vm, device_id, mmio_device, &device_info)?; + ) -> Result<(), MmioError> { + let device = MMIODevice { + resources: self.allocate_mmio_resources(resource_allocator, 1)?, + inner: Arc::new(Mutex::new(mmio_device)), + }; + #[cfg(target_arch = "x86_64")] { - Self::add_virtio_device_to_cmdline(_cmdline, &device_info)?; + Self::add_virtio_device_to_cmdline(_cmdline, &device.resources)?; add_virtio_aml( &mut self.dsdt_data, - device_info.addr, - device_info.len, + device.resources.addr, + device.resources.len, // We are sure that `irqs` has at least one element; allocate_mmio_resources makes // sure of it. - device_info.irq.unwrap(), + device.resources.irq.unwrap(), )?; } - Ok(device_info) + self.register_mmio_virtio(vm, device_id, mmio_bus, device)?; + Ok(()) } #[cfg(target_arch = "aarch64")] @@ -284,6 +282,7 @@ impl MMIODeviceManager { pub fn register_mmio_serial( &mut self, vm: &VmFd, + mmio_bus: &vm_device::Bus, resource_allocator: &mut ResourceAllocator, serial: Arc>, device_info_opt: Option, @@ -302,31 +301,35 @@ impl MMIODeviceManager { ) .map_err(MmioError::RegisterIrqFd)?; - self.bus - .insert(serial.clone(), device_info.addr, device_info.len)?; - self.serial = Some(MMIODevice { + let device = MMIODevice { resources: device_info, inner: serial, - }); + }; + + mmio_bus.insert( + device.inner.clone(), + device.resources.addr, + device.resources.len, + )?; + + self.serial = Some(device); Ok(()) } #[cfg(target_arch = "aarch64")] /// Append the registered early console to the kernel cmdline. + /// + /// This assumes that the device has been registered with the device manager. 
pub fn add_mmio_serial_to_cmdline( &self, cmdline: &mut kernel_cmdline::Cmdline, ) -> Result<(), MmioError> { - match &self.serial { - Some(device) => { - cmdline.insert( - "earlycon", - &format!("uart,mmio,0x{:08x}", device.resources.addr), - )?; - Ok(()) - } - None => Err(MmioError::DeviceNotFound), - } + let device = self.serial.as_ref().unwrap(); + cmdline.insert( + "earlycon", + &format!("uart,mmio,0x{:08x}", device.resources.addr), + )?; + Ok(()) } #[cfg(target_arch = "aarch64")] @@ -334,11 +337,11 @@ impl MMIODeviceManager { /// given as parameter, otherwise allocate a new MMIO resources for it. pub fn register_mmio_rtc( &mut self, + mmio_bus: &vm_device::Bus, resource_allocator: &mut ResourceAllocator, - rtc: RTCDevice, + rtc: Arc>, device_info_opt: Option, ) -> Result<(), MmioError> { - let device = Arc::new(Mutex::new(rtc)); // Create a new MMIODeviceInfo object on boot path or unwrap the // existing object on restore path. let device_info = if let Some(device_info) = device_info_opt { @@ -347,32 +350,41 @@ impl MMIODeviceManager { self.allocate_mmio_resources(resource_allocator, 1)? }; - self.bus - .insert(device.clone(), device_info.addr, device_info.len)?; - self.rtc = Some(MMIODevice { + let device = MMIODevice { resources: device_info, - inner: device, - }); + inner: rtc, + }; + mmio_bus.insert( + device.inner.clone(), + device.resources.addr, + device.resources.len, + )?; + self.rtc = Some(device); Ok(()) } /// Register a boot timer device. pub fn register_mmio_boot_timer( &mut self, + mmio_bus: &vm_device::Bus, resource_allocator: &mut ResourceAllocator, - boot_timer: BootTimer, + boot_timer: Arc>, ) -> Result<(), MmioError> { // Attach a new boot timer device. let device_info = self.allocate_mmio_resources(resource_allocator, 0)?; - - let device = Arc::new(Mutex::new(boot_timer)); - self.bus - .insert(device.clone(), device_info.addr, device_info.len)?; - self.boot_timer = Some(MMIODevice { + let device = MMIODevice { resources: device_info, - inner: device, - }); + inner: boot_timer, + }; + + mmio_bus.insert( + device.inner.clone(), + device.resources.addr, + device.resources.len, + )?; + self.boot_timer = Some(device); + Ok(()) } @@ -516,7 +528,7 @@ impl MMIODeviceManager { } #[cfg(test)] -mod tests { +pub(crate) mod tests { use std::ops::Deref; use std::sync::Arc; @@ -537,7 +549,7 @@ mod tests { const QUEUE_SIZES: &[u16] = &[64]; impl MMIODeviceManager { - fn register_virtio_test_device( + pub(crate) fn register_virtio_test_device( &mut self, vm: &VmFd, guest_mem: GuestMemoryMmap, @@ -547,15 +559,21 @@ mod tests { dev_id: &str, ) -> Result { let interrupt = Arc::new(IrqTrigger::new()); - let mmio_device = MmioTransport::new(guest_mem, interrupt, device, false); - let device_info = self.register_mmio_virtio_for_boot( + let mmio_bus = vm_device::Bus::new(); + let mmio_device = MmioTransport::new(guest_mem, interrupt, device.clone(), false); + self.register_mmio_virtio_for_boot( vm, resource_allocator, dev_id.to_string(), + &mmio_bus, mmio_device, cmdline, )?; - Ok(device_info.addr) + Ok(self + .get_virtio_device(device.lock().unwrap().device_type(), dev_id) + .unwrap() + .resources + .addr) } #[cfg(target_arch = "x86_64")] @@ -570,7 +588,7 @@ mod tests { #[allow(dead_code)] #[derive(Debug)] - struct DummyDevice { + pub(crate) struct DummyDevice { dummy: u32, queues: Vec, queue_evts: [EventFd; 1], diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index bc16604b645..8aec41ffa11 100644 --- a/src/vmm/src/device_manager/mod.rs +++ 
b/src/vmm/src/device_manager/mod.rs @@ -5,6 +5,38 @@ // Use of this source code is governed by a BSD-style license that can be // found in the THIRD-PARTY file. +use std::fmt::Debug; +use std::sync::{Arc, Mutex}; + +use acpi::ACPIDeviceManager; +use event_manager::{MutEventSubscriber, SubscriberOps}; +use kvm_ioctls::VmFd; +#[cfg(target_arch = "x86_64")] +use legacy::{LegacyDeviceError, PortIODeviceManager}; +use linux_loader::loader::Cmdline; +use log::error; +use mmio::{MMIODeviceManager, MmioError}; +use persist::{ACPIDeviceManagerConstructorArgs, MMIODevManagerConstructorArgs}; +use resources::ResourceAllocator; +use serde::{Deserialize, Serialize}; +use utils::time::TimestampUs; +use vmm_sys_util::eventfd::EventFd; + +use crate::devices::acpi::vmgenid::{VmGenId, VmGenIdError}; +#[cfg(target_arch = "x86_64")] +use crate::devices::legacy::I8042Device; +#[cfg(target_arch = "aarch64")] +use crate::devices::legacy::RTCDevice; +use crate::devices::legacy::serial::SerialOut; +use crate::devices::legacy::{IER_RDA_BIT, IER_RDA_OFFSET, SerialDevice}; +use crate::devices::pseudo::BootTimer; +use crate::devices::virtio::device::VirtioDevice; +use crate::devices::virtio::transport::mmio::{IrqTrigger, MmioTransport}; +use crate::resources::VmResources; +use crate::snapshot::Persist; +use crate::vstate::memory::GuestMemoryMmap; +use crate::{EmulateSerialInitError, EventManager}; + /// ACPI device manager. pub mod acpi; /// Legacy Device Manager. @@ -15,3 +47,422 @@ pub mod mmio; pub mod persist; /// Resource manager for devices. pub mod resources; + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +/// Error while creating a new [`DeviceManager`] +pub enum DeviceManagerCreateError { + /// Error with EventFd: {0} + EventFd(#[from] std::io::Error), + #[cfg(target_arch = "x86_64")] + /// Legacy device manager error: {0} + PortIOError(#[from] LegacyDeviceError), + /// Resource allocator error: {0} + ResourceAllocator(#[from] vm_allocator::Error), +} + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +/// Error while attaching a VirtIO device +pub enum AttachMmioDeviceError { + /// MMIO transport error: {0} + MmioTransport(#[from] MmioError), + /// Error inserting device in bus: {0} + Bus(#[from] vm_device::BusError), +} + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +/// Error while attaching the VMGenID device +pub enum AttachVmgenidError { + /// Error creating VMGenID device: {0} + CreateVmGenID(#[from] VmGenIdError), + /// Error while registering VMGenID with KVM: {0} + AttachVmGenID(#[from] kvm_ioctls::Error), +} + +#[cfg(target_arch = "aarch64")] +#[derive(Debug, thiserror::Error, displaydoc::Display)] +/// Error while attaching the VMGenID device +pub enum AttachLegacyMmioDeviceError { + /// Cmdline error + Cmdline, + /// Error creating serial device: {0} + CreateSerial(#[from] std::io::Error), + /// Error registering device: {0} + RegisterMMIODevice(#[from] MmioError), + /// Error inserting device in the Bus: {0} + Bus(#[from] vm_device::BusError), +} + +#[derive(Debug)] +/// A manager of all peripheral devices of Firecracker +pub struct DeviceManager { + /// Allocator for system memory and interrupt numbers + pub resource_allocator: ResourceAllocator, + /// MMIO bus + pub mmio_bus: Arc, + /// MMIO devices + pub mmio_devices: MMIODeviceManager, + #[cfg(target_arch = "x86_64")] + /// Port IO bus + pub pio_bus: Arc, + #[cfg(target_arch = "x86_64")] + /// Legacy devices + pub legacy_devices: PortIODeviceManager, + /// ACPI devices + pub acpi_devices: ACPIDeviceManager, +} 
+ +impl DeviceManager { + // Adds `O_NONBLOCK` to the stdout flags. + fn set_stdout_nonblocking() { + // SAFETY: Call is safe since parameters are valid. + let flags = unsafe { libc::fcntl(libc::STDOUT_FILENO, libc::F_GETFL, 0) }; + if flags < 0 { + error!("Could not get Firecracker stdout flags."); + } + // SAFETY: Call is safe since parameters are valid. + let rc = + unsafe { libc::fcntl(libc::STDOUT_FILENO, libc::F_SETFL, flags | libc::O_NONBLOCK) }; + if rc < 0 { + error!("Could not set Firecracker stdout to non-blocking."); + } + } + + /// Sets up the serial device. + fn setup_serial_device( + event_manager: &mut EventManager, + ) -> Result>, std::io::Error> { + let serial = Arc::new(Mutex::new(SerialDevice::new( + Some(std::io::stdin()), + SerialOut::Stdout(std::io::stdout()), + )?)); + event_manager.add_subscriber(serial.clone()); + Ok(serial) + } + + #[cfg_attr(target_arch = "aarch64", allow(unused))] + pub fn new( + event_manager: &mut EventManager, + vcpu_exit_evt: &EventFd, + vmfd: &VmFd, + ) -> Result { + let mmio_bus = Arc::new(vm_device::Bus::new()); + + #[cfg(target_arch = "x86_64")] + let pio_bus = Arc::new(vm_device::Bus::new()); + #[cfg(target_arch = "x86_64")] + let legacy_devices = { + Self::set_stdout_nonblocking(); + + // Create serial device + let serial = Self::setup_serial_device(event_manager)?; + let reset_evt = vcpu_exit_evt + .try_clone() + .map_err(DeviceManagerCreateError::EventFd)?; + // Create keyboard emulator for reset event + let i8042 = Arc::new(Mutex::new(I8042Device::new( + reset_evt, + EventFd::new(libc::EFD_NONBLOCK).map_err(DeviceManagerCreateError::EventFd)?, + ))); + + // create pio dev manager with legacy devices + let mut legacy_devices = PortIODeviceManager::new(serial, i8042)?; + legacy_devices.register_devices(&pio_bus, vmfd)?; + legacy_devices + }; + + Ok(DeviceManager { + resource_allocator: ResourceAllocator::new()?, + mmio_bus, + mmio_devices: MMIODeviceManager::new(), + #[cfg(target_arch = "x86_64")] + pio_bus, + #[cfg(target_arch = "x86_64")] + legacy_devices, + acpi_devices: ACPIDeviceManager::new(), + }) + } + + /// Attaches a VirtioDevice device to the device manager and event manager. + pub(crate) fn attach_virtio_device( + &mut self, + mem: &GuestMemoryMmap, + vmfd: &VmFd, + id: String, + device: Arc>, + cmdline: &mut Cmdline, + is_vhost_user: bool, + ) -> Result<(), AttachMmioDeviceError> { + let interrupt = Arc::new(IrqTrigger::new()); + // The device mutex mustn't be locked here otherwise it will deadlock. 
+ let device = MmioTransport::new(mem.clone(), interrupt, device, is_vhost_user); + self.mmio_devices.register_mmio_virtio_for_boot( + vmfd, + &mut self.resource_allocator, + id, + &self.mmio_bus, + device, + cmdline, + )?; + + Ok(()) + } + + /// Attaches a [`BootTimer`] to the VM + pub(crate) fn attach_boot_timer_device( + &mut self, + request_ts: TimestampUs, + ) -> Result<(), AttachMmioDeviceError> { + let boot_timer = Arc::new(Mutex::new(BootTimer::new(request_ts))); + + self.mmio_devices.register_mmio_boot_timer( + &self.mmio_bus, + &mut self.resource_allocator, + boot_timer, + )?; + + Ok(()) + } + + pub(crate) fn attach_vmgenid_device( + &mut self, + mem: &GuestMemoryMmap, + vmfd: &VmFd, + ) -> Result<(), AttachVmgenidError> { + let vmgenid = VmGenId::new(mem, &mut self.resource_allocator)?; + self.acpi_devices.attach_vmgenid(vmgenid, vmfd)?; + Ok(()) + } + + #[cfg(target_arch = "aarch64")] + pub(crate) fn attach_legacy_devices_aarch64( + &mut self, + vmfd: &VmFd, + event_manager: &mut EventManager, + cmdline: &mut Cmdline, + ) -> Result<(), AttachLegacyMmioDeviceError> { + // Serial device setup. + let cmdline_contains_console = cmdline + .as_cstring() + .map_err(|_| AttachLegacyMmioDeviceError::Cmdline)? + .into_string() + .map_err(|_| AttachLegacyMmioDeviceError::Cmdline)? + .contains("console="); + + if cmdline_contains_console { + // Make stdout non-blocking. + Self::set_stdout_nonblocking(); + let serial = Self::setup_serial_device(event_manager)?; + self.mmio_devices.register_mmio_serial( + vmfd, + &self.mmio_bus, + &mut self.resource_allocator, + serial, + None, + )?; + self.mmio_devices.add_mmio_serial_to_cmdline(cmdline)?; + } + + let rtc = Arc::new(Mutex::new(RTCDevice::new())); + self.mmio_devices.register_mmio_rtc( + &self.mmio_bus, + &mut self.resource_allocator, + rtc, + None, + )?; + Ok(()) + } +} + +#[derive(Debug, Default, Clone, Serialize, Deserialize)] +/// State of devices in the system +pub struct DevicesState { + /// MMIO devices state + pub mmio_state: persist::DeviceStates, + /// ACPI devices state + pub acpi_state: persist::ACPIDeviceManagerState, +} + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum DevicePersistError { + /// Error restoring MMIO devices: {0} + MmioRestore(#[from] persist::DevicePersistError), + /// Error restoring ACPI devices: {0} + AcpiRestore(#[from] persist::ACPIDeviceManagerRestoreError), + /// Error notifying VMGenID device: {0} + VmGenidUpdate(#[from] std::io::Error), + /// Error resetting serial console: {0} + SerialRestore(#[from] EmulateSerialInitError), + /// Error inserting device in bus: {0} + Bus(#[from] vm_device::BusError), +} + +pub struct DeviceRestoreArgs<'a> { + pub mem: &'a GuestMemoryMmap, + pub vm: &'a VmFd, + pub event_manager: &'a mut EventManager, + pub vm_resources: &'a mut VmResources, + pub instance_id: &'a str, + pub restored_from_file: bool, +} + +impl DeviceManager { + pub fn save(&self) -> DevicesState { + DevicesState { + mmio_state: self.mmio_devices.save(), + acpi_state: self.acpi_devices.save(), + } + } + + /// Sets RDA bit in serial console + pub fn emulate_serial_init(&self) -> Result<(), EmulateSerialInitError> { + // When restoring from a previously saved state, there is no serial + // driver initialization, therefore the RDA (Received Data Available) + // interrupt is not enabled. Because of that, the driver won't get + // notified of any bytes that we send to the guest. The clean solution + // would be to save the whole serial device state when we do the vm + // serialization. 
For now we set that bit manually + + #[cfg(target_arch = "aarch64")] + { + if let Some(device) = &self.mmio_devices.serial { + let mut device_locked = device.inner.lock().expect("Poisoned lock"); + + device_locked + .serial + .write(IER_RDA_OFFSET, IER_RDA_BIT) + .map_err(|_| EmulateSerialInitError(std::io::Error::last_os_error()))?; + } + Ok(()) + } + + #[cfg(target_arch = "x86_64")] + { + let mut serial = self + .legacy_devices + .stdio_serial + .lock() + .expect("Poisoned lock"); + + serial + .serial + .write(IER_RDA_OFFSET, IER_RDA_BIT) + .map_err(|_| EmulateSerialInitError(std::io::Error::last_os_error()))?; + Ok(()) + } + } + + pub fn restore( + &mut self, + state: &DevicesState, + restore_args: DeviceRestoreArgs, + ) -> Result<(), DevicePersistError> { + // Restore MMIO devices + let mmio_ctor_args = MMIODevManagerConstructorArgs { + mmio_bus: &self.mmio_bus, + mem: restore_args.mem, + vm: restore_args.vm, + event_manager: restore_args.event_manager, + resource_allocator: &mut self.resource_allocator, + vm_resources: restore_args.vm_resources, + instance_id: restore_args.instance_id, + restored_from_file: restore_args.restored_from_file, + }; + self.mmio_devices = MMIODeviceManager::restore(mmio_ctor_args, &state.mmio_state)?; + + // Restore serial. + // We need to do that after we restore mmio devices, otherwise it won't succeed in Aarch64 + self.emulate_serial_init()?; + + // Restore ACPI devices + let acpi_ctor_args = ACPIDeviceManagerConstructorArgs { + mem: restore_args.mem, + resource_allocator: &mut self.resource_allocator, + vm: restore_args.vm, + }; + self.acpi_devices = ACPIDeviceManager::restore(acpi_ctor_args, &state.acpi_state)?; + self.acpi_devices.notify_vmgenid()?; + + Ok(()) + } +} + +#[cfg(test)] +pub(crate) mod tests { + use super::*; + #[cfg(target_arch = "aarch64")] + use crate::builder::tests::default_vmm; + + pub(crate) fn default_device_manager() -> DeviceManager { + let mmio_bus = Arc::new(vm_device::Bus::new()); + #[cfg(target_arch = "x86_64")] + let pio_bus = Arc::new(vm_device::Bus::new()); + let mmio_devices = MMIODeviceManager::new(); + let acpi_devices = ACPIDeviceManager::new(); + let resource_allocator = ResourceAllocator::new().unwrap(); + + #[cfg(target_arch = "x86_64")] + let legacy_devices = PortIODeviceManager::new( + Arc::new(Mutex::new( + SerialDevice::new(None, SerialOut::Sink(std::io::sink())).unwrap(), + )), + Arc::new(Mutex::new(I8042Device::new( + EventFd::new(libc::EFD_NONBLOCK).unwrap(), + EventFd::new(libc::EFD_NONBLOCK).unwrap(), + ))), + ) + .unwrap(); + + DeviceManager { + resource_allocator, + mmio_bus, + mmio_devices, + #[cfg(target_arch = "x86_64")] + pio_bus, + #[cfg(target_arch = "x86_64")] + legacy_devices, + acpi_devices, + } + } + + #[cfg(target_arch = "aarch64")] + #[test] + fn test_attach_legacy_serial() { + let mut vmm = default_vmm(); + assert!(vmm.device_manager.mmio_devices.rtc.is_none()); + assert!(vmm.device_manager.mmio_devices.serial.is_none()); + + let mut cmdline = Cmdline::new(4096).unwrap(); + let mut event_manager = EventManager::new().unwrap(); + vmm.device_manager + .attach_legacy_devices_aarch64(vmm.vm.fd(), &mut event_manager, &mut cmdline) + .unwrap(); + assert!(vmm.device_manager.mmio_devices.rtc.is_some()); + assert!(vmm.device_manager.mmio_devices.serial.is_none()); + + let mut vmm = default_vmm(); + cmdline.insert("console", "/dev/blah").unwrap(); + vmm.device_manager + .attach_legacy_devices_aarch64(vmm.vm.fd(), &mut event_manager, &mut cmdline) + .unwrap(); + 
assert!(vmm.device_manager.mmio_devices.rtc.is_some()); + assert!(vmm.device_manager.mmio_devices.serial.is_some()); + + assert!( + cmdline + .as_cstring() + .unwrap() + .into_string() + .unwrap() + .contains(&format!( + "earlycon=uart,mmio,0x{:08x}", + vmm.device_manager + .mmio_devices + .serial + .as_ref() + .unwrap() + .resources + .addr + )) + ); + } +} diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 6a54a67b33d..c0288b03a59 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -63,6 +63,8 @@ pub enum DevicePersistError { DeviceManager(#[from] super::mmio::MmioError), /// Mmio transport MmioTransport, + /// Bus error: {0} + Bus(#[from] vm_device::BusError), #[cfg(target_arch = "aarch64")] /// Legacy: {0} Legacy(#[from] std::io::Error), @@ -193,6 +195,7 @@ pub enum SharedDeviceType { } pub struct MMIODevManagerConstructorArgs<'a> { + pub mmio_bus: &'a vm_device::Bus, pub mem: &'a GuestMemoryMmap, pub vm: &'a VmFd, pub event_manager: &'a mut EventManager, @@ -425,13 +428,14 @@ impl<'a> Persist<'a> for MMIODeviceManager { dev_manager.register_mmio_serial( vm, + constructor_args.mmio_bus, constructor_args.resource_allocator, serial, Some(state.device_info), )?; } if state.type_ == DeviceType::Rtc { - let rtc = RTCDevice::new(); + let rtc = Arc::new(Mutex::new(RTCDevice::new())); constructor_args .resource_allocator .allocate_mmio_memory( @@ -443,6 +447,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { DevicePersistError::DeviceManager(super::mmio::MmioError::Allocator(e)) })?; dev_manager.register_mmio_rtc( + constructor_args.mmio_bus, constructor_args.resource_allocator, rtc, Some(state.device_info), @@ -458,6 +463,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { state: &MmioTransportState, interrupt: Arc, device_info: &MMIODeviceInfo, + mmio_bus: &vm_device::Bus, event_manager: &mut EventManager| -> Result<(), Self::Error> { let restore_args = MmioTransportConstructorArgs { @@ -466,8 +472,10 @@ impl<'a> Persist<'a> for MMIODeviceManager { device, is_vhost_user, }; - let mmio_transport = MmioTransport::restore(restore_args, state) - .map_err(|()| DevicePersistError::MmioTransport)?; + let mmio_transport = Arc::new(Mutex::new( + MmioTransport::restore(restore_args, state) + .map_err(|()| DevicePersistError::MmioTransport)?, + )); // We do not currently require exact re-allocation of IDs via // `dev_manager.irq_allocator.allocate_id()` and currently cannot do @@ -490,7 +498,15 @@ impl<'a> Persist<'a> for MMIODeviceManager { DevicePersistError::DeviceManager(super::mmio::MmioError::Allocator(e)) })?; - dev_manager.register_mmio_virtio(vm, id.clone(), mmio_transport, device_info)?; + dev_manager.register_mmio_virtio( + vm, + id.clone(), + mmio_bus, + MMIODevice { + resources: *device_info, + inner: mmio_transport, + }, + )?; event_manager.add_subscriber(as_subscriber); Ok(()) @@ -519,6 +535,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { &balloon_state.transport_state, interrupt, &balloon_state.device_info, + constructor_args.mmio_bus, constructor_args.event_manager, )?; } @@ -545,6 +562,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { &block_state.transport_state, interrupt, &block_state.device_info, + constructor_args.mmio_bus, constructor_args.event_manager, )?; } @@ -586,6 +604,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { &net_state.transport_state, interrupt, &net_state.device_info, + constructor_args.mmio_bus, constructor_args.event_manager, )?; } @@ -617,6 +636,7 @@ impl<'a> 
Persist<'a> for MMIODeviceManager { &vsock_state.transport_state, interrupt, &vsock_state.device_info, + constructor_args.mmio_bus, constructor_args.event_manager, )?; } @@ -642,6 +662,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { &entropy_state.transport_state, interrupt, &entropy_state.device_info, + constructor_args.mmio_bus, constructor_args.event_manager, )?; } @@ -701,18 +722,6 @@ mod tests { } } - impl MMIODeviceManager { - fn soft_clone(&self) -> Self { - // We can unwrap here as we create with values directly in scope we - // know will results in `Ok` - let mut clone = MMIODeviceManager::new(); - // We only care about the device hashmap. - clone.virtio_devices.clone_from(&self.virtio_devices); - clone.boot_timer = self.boot_timer.clone(); - clone - } - } - impl PartialEq for MMIODevice { fn eq(&self, other: &Self) -> bool { self.resources == other.resources @@ -745,7 +754,7 @@ mod tests { let mut resource_allocator = ResourceAllocator::new().unwrap(); tmp_sock_file.remove().unwrap(); // Set up a vmm with one of each device, and get the serialized DeviceStates. - let original_mmio_device_manager = { + { let mut event_manager = EventManager::new().expect("Unable to create EventManager"); let mut vmm = default_vmm(); let mut cmdline = default_kernel_cmdline(); @@ -795,11 +804,9 @@ mod tests { let entropy_config = EntropyDeviceConfig::default(); insert_entropy_device(&mut vmm, &mut cmdline, &mut event_manager, entropy_config); - Snapshot::serialize(&mut buf.as_mut_slice(), &vmm.mmio_device_manager.save()).unwrap(); + Snapshot::serialize(&mut buf.as_mut_slice(), &vmm.device_manager.save()).unwrap(); + } - // We only want to keep the device map from the original MmioDeviceManager. - vmm.mmio_device_manager.soft_clone() - }; tmp_sock_file.remove().unwrap(); let mut event_manager = EventManager::new().expect("Unable to create EventManager"); @@ -807,6 +814,7 @@ mod tests { let device_states: DeviceStates = Snapshot::deserialize(&mut buf.as_slice()).unwrap(); let vm_resources = &mut VmResources::default(); let restore_args = MMIODevManagerConstructorArgs { + mmio_bus: &vmm.device_manager.mmio_bus, mem: vmm.vm.guest_memory(), vm: vmm.vm.fd(), event_manager: &mut event_manager, @@ -815,7 +823,7 @@ mod tests { instance_id: "microvm-id", restored_from_file: true, }; - let restored_dev_manager = + let _restored_dev_manager = MMIODeviceManager::restore(restore_args, &device_states).unwrap(); let expected_vm_resources = format!( @@ -893,8 +901,6 @@ mod tests { MmdsVersion::V2 ); assert_eq!(device_states.mmds.unwrap().version, MmdsVersion::V2); - - assert_eq!(restored_dev_manager, original_mmio_device_manager); assert_eq!( expected_vm_resources, serde_json::to_string_pretty(&VmmConfig::from(&*vm_resources)).unwrap() diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 1230b337d35..b8aca60a00c 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -121,8 +121,7 @@ use std::sync::mpsc::RecvTimeoutError; use std::sync::{Arc, Barrier, Mutex}; use std::time::Duration; -use device_manager::acpi::ACPIDeviceManager; -use device_manager::resources::ResourceAllocator; +use device_manager::DeviceManager; use devices::acpi::vmgenid::VmGenIdError; use devices::virtio::device::VirtioDevice; use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEventSubscriber}; @@ -135,10 +134,6 @@ use vstate::kvm::Kvm; use vstate::vcpu::{self, StartThreadedError, VcpuSendEventError}; use crate::cpu_config::templates::CpuConfiguration; -#[cfg(target_arch = "x86_64")] -use 
crate::device_manager::legacy::PortIODeviceManager; -use crate::device_manager::mmio::MMIODeviceManager; -use crate::devices::legacy::{IER_RDA_BIT, IER_RDA_OFFSET}; use crate::devices::virtio::balloon::{ BALLOON_DEV_ID, Balloon, BalloonConfig, BalloonError, BalloonStats, }; @@ -148,7 +143,6 @@ use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET}; use crate::logger::{METRICS, MetricsError, error, info, warn}; use crate::persist::{MicrovmState, MicrovmStateError, VmInfo}; use crate::rate_limiter::BucketUpdate; -use crate::snapshot::Persist; use crate::vmm_config::instance_info::{InstanceInfo, VmState}; use crate::vstate::memory::{GuestMemory, GuestMemoryMmap, GuestMemoryRegion}; use crate::vstate::vcpu::VcpuState; @@ -205,17 +199,15 @@ pub const HTTP_MAX_PAYLOAD_SIZE: usize = 51200; /// have permissions to open the KVM fd). #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum VmmError { - /// Failed to allocate guest resource: {0} - AllocateResources(#[from] vm_allocator::Error), #[cfg(target_arch = "aarch64")] /// Invalid command line error. Cmdline, /// Device manager error: {0} - DeviceManager(device_manager::mmio::MmioError), + DeviceManager(#[from] device_manager::DeviceManagerCreateError), + /// MMIO Device manager error: {0} + MmioDeviceManager(device_manager::mmio::MmioError), /// Error getting the KVM dirty bitmap. {0} DirtyBitmap(kvm_ioctls::Error), - /// Event fd error: {0} - EventFd(io::Error), /// I8042 error: {0} I8042Error(devices::legacy::I8042DeviceError), #[cfg(target_arch = "x86_64")] @@ -313,14 +305,8 @@ pub struct Vmm { vcpus_handles: Vec, // Used by Vcpus and devices to initiate teardown; Vmm should never write here. vcpus_exit_evt: EventFd, - - // Allocator for guest resources - resource_allocator: ResourceAllocator, - // Guest VM devices. - mmio_device_manager: MMIODeviceManager, - #[cfg(target_arch = "x86_64")] - pio_device_manager: PortIODeviceManager, - acpi_device_manager: ACPIDeviceManager, + // Device manager + device_manager: DeviceManager, } impl Vmm { @@ -346,7 +332,8 @@ impl Vmm { device_id: &str, ) -> Option>> { let device = self - .mmio_device_manager + .device_manager + .mmio_devices .get_virtio_device(device_type, device_id)?; Some(device.inner.lock().expect("Poisoned lock").device().clone()) @@ -382,10 +369,10 @@ impl Vmm { self.vcpus_handles.reserve(vcpu_count); for mut vcpu in vcpus.drain(..) { - vcpu.set_mmio_bus(self.mmio_device_manager.bus.clone()); + vcpu.set_mmio_bus(self.device_manager.mmio_bus.clone()); #[cfg(target_arch = "x86_64")] vcpu.kvm_vcpu - .set_pio_bus(self.pio_device_manager.io_bus.clone()); + .set_pio_bus(self.device_manager.pio_bus.clone()); self.vcpus_handles .push(vcpu.start_threaded(vcpu_seccomp_filter.clone(), barrier.clone())?); @@ -399,7 +386,7 @@ impl Vmm { /// Sends a resume command to the vCPUs. pub fn resume_vm(&mut self) -> Result<(), VmmError> { - self.mmio_device_manager.kick_devices(); + self.device_manager.mmio_devices.kick_devices(); // Send the events. self.vcpus_handles @@ -443,48 +430,11 @@ impl Vmm { Ok(()) } - /// Sets RDA bit in serial console - pub fn emulate_serial_init(&self) -> Result<(), EmulateSerialInitError> { - // When restoring from a previously saved state, there is no serial - // driver initialization, therefore the RDA (Received Data Available) - // interrupt is not enabled. Because of that, the driver won't get - // notified of any bytes that we send to the guest. The clean solution - // would be to save the whole serial device state when we do the vm - // serialization. 
For now we set that bit manually - - #[cfg(target_arch = "aarch64")] - { - if let Some(device) = &self.mmio_device_manager.serial { - let mut device_locked = device.inner.lock().expect("Poisoned lock"); - - device_locked - .serial - .write(IER_RDA_OFFSET, IER_RDA_BIT) - .map_err(|_| EmulateSerialInitError(std::io::Error::last_os_error()))?; - } - Ok(()) - } - - #[cfg(target_arch = "x86_64")] - { - let mut serial = self - .pio_device_manager - .stdio_serial - .lock() - .expect("Poisoned lock"); - - serial - .serial - .write(IER_RDA_OFFSET, IER_RDA_BIT) - .map_err(|_| EmulateSerialInitError(std::io::Error::last_os_error()))?; - Ok(()) - } - } - /// Injects CTRL+ALT+DEL keystroke combo in the i8042 device. #[cfg(target_arch = "x86_64")] pub fn send_ctrl_alt_del(&mut self) -> Result<(), VmmError> { - self.pio_device_manager + self.device_manager + .legacy_devices .i8042 .lock() .expect("i8042 lock was poisoned") @@ -509,9 +459,7 @@ impl Vmm { self.vm.save_state(&mpidrs).map_err(SaveVmState)? } }; - let device_states = self.mmio_device_manager.save(); - - let acpi_dev_state = self.acpi_device_manager.save(); + let device_states = self.device_manager.save(); Ok(MicrovmState { vm_info: vm_info.clone(), @@ -519,7 +467,6 @@ impl Vmm { vm_state, vcpu_states, device_states, - acpi_dev_state, }) } @@ -586,13 +533,14 @@ impl Vmm { drive_id: &str, path_on_host: String, ) -> Result<(), VmmError> { - self.mmio_device_manager + self.device_manager + .mmio_devices .with_virtio_device_with_id(TYPE_BLOCK, drive_id, |block: &mut Block| { block .update_disk_image(path_on_host) .map_err(|err| err.to_string()) }) - .map_err(VmmError::DeviceManager) + .map_err(VmmError::MmioDeviceManager) } /// Updates the rate limiter parameters for block device with `drive_id` id. @@ -602,22 +550,24 @@ impl Vmm { rl_bytes: BucketUpdate, rl_ops: BucketUpdate, ) -> Result<(), VmmError> { - self.mmio_device_manager + self.device_manager + .mmio_devices .with_virtio_device_with_id(TYPE_BLOCK, drive_id, |block: &mut Block| { block .update_rate_limiter(rl_bytes, rl_ops) .map_err(|err| err.to_string()) }) - .map_err(VmmError::DeviceManager) + .map_err(VmmError::MmioDeviceManager) } /// Updates the rate limiter parameters for block device with `drive_id` id. pub fn update_vhost_user_block_config(&mut self, drive_id: &str) -> Result<(), VmmError> { - self.mmio_device_manager + self.device_manager + .mmio_devices .with_virtio_device_with_id(TYPE_BLOCK, drive_id, |block: &mut Block| { block.update_config().map_err(|err| err.to_string()) }) - .map_err(VmmError::DeviceManager) + .map_err(VmmError::MmioDeviceManager) } /// Updates the rate limiter parameters for net device with `net_id` id. @@ -629,12 +579,13 @@ impl Vmm { tx_bytes: BucketUpdate, tx_ops: BucketUpdate, ) -> Result<(), VmmError> { - self.mmio_device_manager + self.device_manager + .mmio_devices .with_virtio_device_with_id(TYPE_NET, net_id, |net: &mut Net| { net.patch_rate_limiters(rx_bytes, rx_ops, tx_bytes, tx_ops); Ok(()) }) - .map_err(VmmError::DeviceManager) + .map_err(VmmError::MmioDeviceManager) } /// Returns a reference to the balloon device if present. 
diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 14af3ecd792..f8b02a36876 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -25,7 +25,7 @@ use crate::cpu_config::templates::StaticCpuTemplate; use crate::cpu_config::x86_64::cpuid::CpuidTrait; #[cfg(target_arch = "x86_64")] use crate::cpu_config::x86_64::cpuid::common::get_vendor_id_from_host; -use crate::device_manager::persist::{ACPIDeviceManagerState, DevicePersistError, DeviceStates}; +use crate::device_manager::{DevicePersistError, DevicesState}; use crate::logger::{info, warn}; use crate::resources::VmResources; use crate::seccomp::BpfThreadMap; @@ -81,9 +81,7 @@ pub struct MicrovmState { /// Vcpu states. pub vcpu_states: Vec, /// Device states. - pub device_states: DeviceStates, - /// ACPI devices state. - pub acpi_dev_state: ACPIDeviceManagerState, + pub device_states: DevicesState, } /// This describes the mapping between Firecracker base virtual address and @@ -118,7 +116,7 @@ pub enum MicrovmStateError { /// Operation not allowed: {0} NotAllowed(String), /// Cannot restore devices: {0} - RestoreDevices(DevicePersistError), + RestoreDevices(#[from] DevicePersistError), /// Cannot save Vcpu state: {0} SaveVcpuState(vstate::vcpu::VcpuError), /// Cannot save Vm state: {0} @@ -171,7 +169,8 @@ pub fn create_snapshot( // SAFETY: // This should never fail as we only mark pages only if device has already been activated, // and the address validation was already performed on device activation. - vmm.mmio_device_manager + vmm.device_manager + .mmio_devices .for_each_virtio_device(|_, _, device| { let mmio_dev_locked = device.inner.lock().expect("Poisoned lock"); let mut d = mmio_dev_locked.locked_device(); @@ -335,7 +334,7 @@ pub fn restore_from_snapshot( ) -> Result>, RestoreFromSnapshotError> { let mut microvm_state = snapshot_state_from_file(¶ms.snapshot_path)?; for entry in ¶ms.network_overrides { - let net_devices = &mut microvm_state.device_states.net_devices; + let net_devices = &mut microvm_state.device_states.mmio_state.net_devices; if let Some(device) = net_devices .iter_mut() .find(|x| x.device_state.id == entry.iface_id) @@ -600,7 +599,6 @@ mod tests { #[cfg(target_arch = "aarch64")] use crate::construct_kvm_mpidrs; use crate::devices::virtio::block::CacheType; - use crate::snapshot::Persist; use crate::vmm_config::balloon::BalloonDeviceConfig; use crate::vmm_config::net::NetworkInterfaceConfig; use crate::vmm_config::vsock::tests::default_config; @@ -661,14 +659,14 @@ mod tests { #[test] fn test_microvm_state_snapshot() { let vmm = default_vmm_with_devices(); - let states = vmm.mmio_device_manager.save(); + let states = vmm.device_manager.save(); // Only checking that all devices are saved, actual device state // is tested by that device's tests. 
- assert_eq!(states.block_devices.len(), 1); - assert_eq!(states.net_devices.len(), 1); - assert!(states.vsock_device.is_some()); - assert!(states.balloon_device.is_some()); + assert_eq!(states.mmio_state.block_devices.len(), 1); + assert_eq!(states.mmio_state.net_devices.len(), 1); + assert!(states.mmio_state.vsock_device.is_some()); + assert!(states.mmio_state.balloon_device.is_some()); let vcpu_states = vec![VcpuState::default()]; #[cfg(target_arch = "aarch64")] @@ -685,7 +683,6 @@ mod tests { vm_state: vmm.vm.save_state(&mpidrs).unwrap(), #[cfg(target_arch = "x86_64")] vm_state: vmm.vm.save_state().unwrap(), - acpi_dev_state: vmm.acpi_device_manager.save(), }; let mut buf = vec![0; 10000]; @@ -696,8 +693,8 @@ mod tests { assert_eq!(restored_microvm_state.vm_info, microvm_state.vm_info); assert_eq!( - restored_microvm_state.device_states, - microvm_state.device_states + restored_microvm_state.device_states.mmio_state, + microvm_state.device_states.mmio_state ) } diff --git a/src/vmm/tests/integration_tests.rs b/src/vmm/tests/integration_tests.rs index 2c25f3f17c3..02612743beb 100644 --- a/src/vmm/tests/integration_tests.rs +++ b/src/vmm/tests/integration_tests.rs @@ -220,9 +220,29 @@ fn verify_create_snapshot(is_diff: bool) -> (TempFile, TempFile) { // Verify deserialized data. // The default vmm has no devices and one vCPU. - assert_eq!(restored_microvm_state.device_states.block_devices.len(), 0); - assert_eq!(restored_microvm_state.device_states.net_devices.len(), 0); - assert!(restored_microvm_state.device_states.vsock_device.is_none()); + assert_eq!( + restored_microvm_state + .device_states + .mmio_state + .block_devices + .len(), + 0 + ); + assert_eq!( + restored_microvm_state + .device_states + .mmio_state + .net_devices + .len(), + 0 + ); + assert!( + restored_microvm_state + .device_states + .mmio_state + .vsock_device + .is_none() + ); assert_eq!(restored_microvm_state.vcpu_states.len(), 1); (snapshot_file, memory_file) diff --git a/tests/integration_tests/functional/test_api.py b/tests/integration_tests/functional/test_api.py index e241d4ef1c7..1cc5d3c6c61 100644 --- a/tests/integration_tests/functional/test_api.py +++ b/tests/integration_tests/functional/test_api.py @@ -772,7 +772,7 @@ def test_send_ctrl_alt_del(uvm_plain_any): def _drive_patch(test_microvm, io_engine): """Exercise drive patch test scenarios.""" # Patches without mandatory fields for virtio block are not allowed. - expected_msg = "Unable to patch the block device: Device manager error: Running method expected different backend. Please verify the request arguments" + expected_msg = "Unable to patch the block device: MMIO Device manager error: Running method expected different backend. Please verify the request arguments" with pytest.raises(RuntimeError, match=expected_msg): test_microvm.api.drive.patch(drive_id="scratch") @@ -814,7 +814,7 @@ def _drive_patch(test_microvm, io_engine): ) # Updates to `path_on_host` with an invalid path are not allowed. 
- expected_msg = f"Unable to patch the block device: Device manager error: Virtio backend error: Error manipulating the backing file: No such file or directory (os error 2) {drive_path} Please verify the request arguments" + expected_msg = f"Unable to patch the block device: MMIO Device manager error: Virtio backend error: Error manipulating the backing file: No such file or directory (os error 2) {drive_path} Please verify the request arguments" with pytest.raises(RuntimeError, match=re.escape(expected_msg)): test_microvm.api.drive.patch(drive_id="scratch", path_on_host=drive_path) From ac38c8f0cd02f4cc49ef6d93ab1aea1f6b237543 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 7 May 2025 10:26:38 +0200 Subject: [PATCH 12/99] refactor: simplify creation of I8042 device We always create anew the keyboard interrupt event. Just create it inside `I8042Device::new()` and return an error if that fails. Signed-off-by: Babis Chalios --- src/vmm/src/device_manager/legacy.rs | 7 +++---- src/vmm/src/device_manager/mod.rs | 12 ++++-------- src/vmm/src/devices/legacy/i8042.rs | 28 ++++++++-------------------- 3 files changed, 15 insertions(+), 32 deletions(-) diff --git a/src/vmm/src/device_manager/legacy.rs b/src/vmm/src/device_manager/legacy.rs index a2866f14415..cedb7abc32c 100644 --- a/src/vmm/src/device_manager/legacy.rs +++ b/src/vmm/src/device_manager/legacy.rs @@ -259,10 +259,9 @@ mod tests { ), input: None, })), - Arc::new(Mutex::new(I8042Device::new( - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - ))), + Arc::new(Mutex::new( + I8042Device::new(EventFd::new(libc::EFD_NONBLOCK).unwrap()).unwrap(), + )), ) .unwrap(); ldm.register_devices(&io_bus, vm.fd()).unwrap(); diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 8aec41ffa11..3e3f0f0ffda 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -159,10 +159,7 @@ impl DeviceManager { .try_clone() .map_err(DeviceManagerCreateError::EventFd)?; // Create keyboard emulator for reset event - let i8042 = Arc::new(Mutex::new(I8042Device::new( - reset_evt, - EventFd::new(libc::EFD_NONBLOCK).map_err(DeviceManagerCreateError::EventFd)?, - ))); + let i8042 = Arc::new(Mutex::new(I8042Device::new(reset_evt)?)); // create pio dev manager with legacy devices let mut legacy_devices = PortIODeviceManager::new(serial, i8042)?; @@ -405,10 +402,9 @@ pub(crate) mod tests { Arc::new(Mutex::new( SerialDevice::new(None, SerialOut::Sink(std::io::sink())).unwrap(), )), - Arc::new(Mutex::new(I8042Device::new( - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - ))), + Arc::new(Mutex::new( + I8042Device::new(EventFd::new(libc::EFD_NONBLOCK).unwrap()).unwrap(), + )), ) .unwrap(); diff --git a/src/vmm/src/devices/legacy/i8042.rs b/src/vmm/src/devices/legacy/i8042.rs index 1bc830bd13b..235ce2a7339 100644 --- a/src/vmm/src/devices/legacy/i8042.rs +++ b/src/vmm/src/devices/legacy/i8042.rs @@ -119,10 +119,10 @@ pub struct I8042Device { impl I8042Device { /// Constructs an i8042 device that will signal the given event when the guest requests it. 
- pub fn new(reset_evt: EventFd, kbd_interrupt_evt: EventFd) -> I8042Device { - I8042Device { + pub fn new(reset_evt: EventFd) -> Result { + Ok(I8042Device { reset_evt, - kbd_interrupt_evt, + kbd_interrupt_evt: EventFd::new(libc::EFD_NONBLOCK)?, control: CB_POST_OK | CB_KBD_INT, cmd: 0, outp: 0, @@ -130,7 +130,7 @@ impl I8042Device { buf: [0; BUF_SIZE], bhead: Wrapping(0), btail: Wrapping(0), - } + }) } /// Signal a ctrl-alt-del (reset) event. @@ -355,10 +355,7 @@ mod tests { #[test] fn test_i8042_read_write_and_event() { - let mut i8042 = I8042Device::new( - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - ); + let mut i8042 = I8042Device::new(EventFd::new(libc::EFD_NONBLOCK).unwrap()).unwrap(); let reset_evt = i8042.reset_evt.try_clone().unwrap(); // Check if reading in a 2-length array doesn't have side effects. @@ -395,10 +392,7 @@ mod tests { #[test] fn test_i8042_commands() { - let mut i8042 = I8042Device::new( - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - ); + let mut i8042 = I8042Device::new(EventFd::new(libc::EFD_NONBLOCK).unwrap()).unwrap(); let mut data = [1]; // Test reading/writing the control register. @@ -435,10 +429,7 @@ mod tests { #[test] fn test_i8042_buffer() { - let mut i8042 = I8042Device::new( - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - ); + let mut i8042 = I8042Device::new(EventFd::new(libc::EFD_NONBLOCK).unwrap()).unwrap(); // Test push/pop. i8042.push_byte(52).unwrap(); @@ -462,10 +453,7 @@ mod tests { #[test] fn test_i8042_kbd() { - let mut i8042 = I8042Device::new( - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - ); + let mut i8042 = I8042Device::new(EventFd::new(libc::EFD_NONBLOCK).unwrap()).unwrap(); fn expect_key(i8042: &mut I8042Device, key: u16) { let mut data = [1]; From 695b1cc7f4c51ae4ad9693af7c626fdfd152565c Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 12 May 2025 21:12:34 +0200 Subject: [PATCH 13/99] test: add network interface to test_serial_dos test test_serial_dos test checks that when we send a lot of bytes in the serial device the emulation logic does not increase indefinitely the underlying buffer that we use for when the device is set in loopback mode. However, the test does not wait for the microVM to start and sometimes the virtual memory allocation may increase between readings. Add a network device to the microVM so that we implicitly wait until it has booted before taking the first measurement. Signed-off-by: Babis Chalios --- tests/integration_tests/functional/test_serial_io.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration_tests/functional/test_serial_io.py b/tests/integration_tests/functional/test_serial_io.py index 7d7939a064e..01900ec55e0 100644 --- a/tests/integration_tests/functional/test_serial_io.py +++ b/tests/integration_tests/functional/test_serial_io.py @@ -148,6 +148,7 @@ def test_serial_dos(uvm_plain_any): vcpu_count=1, boot_args="console=ttyS0 reboot=k panic=1 pci=off", ) + microvm.add_net_iface() microvm.start() # Open an fd for firecracker process terminal. From 59064b9f1aafabeb4a780e797d79fa59d79981c1 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 28 Apr 2025 11:01:54 +0200 Subject: [PATCH 14/99] pci: add pci crate from Cloud Hypervisor Bring in pci crate from cloud hypervisor with a few modifications. We use the rust-vmm vm-allocator crate instead of Cloud Hypervisor's downstream one. 
For the time being, rust-vmm's implementation should include all we need for supporting the devices we care about. If we need more functionality from our allocators, we will implement the logic directly in the rust-vmm vm-allocator crate. Signed-off-by: Babis Chalios --- Cargo.lock | 22 + src/pci/Cargo.toml | 25 + src/pci/src/bus.rs | 477 +++++++++++++ src/pci/src/configuration.rs | 1252 ++++++++++++++++++++++++++++++++++ src/pci/src/device.rs | 136 ++++ src/pci/src/lib.rs | 198 ++++++ src/pci/src/msi.rs | 282 ++++++++ src/pci/src/msix.rs | 552 +++++++++++++++ src/vmm/Cargo.toml | 3 + 9 files changed, 2947 insertions(+) create mode 100644 src/pci/Cargo.toml create mode 100644 src/pci/src/bus.rs create mode 100644 src/pci/src/configuration.rs create mode 100644 src/pci/src/device.rs create mode 100644 src/pci/src/lib.rs create mode 100644 src/pci/src/msi.rs create mode 100644 src/pci/src/msix.rs diff --git a/Cargo.lock b/Cargo.lock index aff0432be7f..0e44d6a7596 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -250,6 +250,12 @@ version = "3.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "cargo_toml" version = "0.22.3" @@ -1027,6 +1033,20 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +[[package]] +name = "pci" +version = "0.1.0" +dependencies = [ + "byteorder", + "libc", + "log", + "serde", + "thiserror 2.0.12", + "vm-allocator", + "vm-device", + "vm-memory", +] + [[package]] name = "peeking_take_while" version = "0.1.2" @@ -1644,6 +1664,7 @@ dependencies = [ "log-instrument", "memfd", "micro_http", + "pci", "proptest", "semver", "serde", @@ -1653,6 +1674,7 @@ dependencies = [ "timerfd", "userfaultfd", "utils", + "uuid", "vhost", "vm-allocator", "vm-device", diff --git a/src/pci/Cargo.toml b/src/pci/Cargo.toml new file mode 100644 index 00000000000..c88cd270b23 --- /dev/null +++ b/src/pci/Cargo.toml @@ -0,0 +1,25 @@ +[package] +authors = ["Samuel Ortiz "] +edition = "2021" +name = "pci" +version = "0.1.0" +license = "Apache-2.0 AND BSD-3-Clause" + +[lib] +bench = false + +[features] +default = [] + +[dependencies] +byteorder = "1.5.0" +libc = "0.2.172" +log = "0.4.27" +serde = { version = "1.0.219", features = ["derive"] } +thiserror = "2.0.12" +vm-allocator = "0.1.2" +vm-device = { path = "../vm-device" } +vm-memory = { version = "0.16.1", features = [ + "backend-mmap", + "backend-bitmap", +] } diff --git a/src/pci/src/bus.rs b/src/pci/src/bus.rs new file mode 100644 index 00000000000..cb42b4ee9c5 --- /dev/null +++ b/src/pci/src/bus.rs @@ -0,0 +1,477 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. 
+// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::any::Any; +use std::collections::HashMap; +use std::ops::DerefMut; +use std::sync::{Arc, Barrier, Mutex}; + +use byteorder::{ByteOrder, LittleEndian}; +use vm_device::{Bus, BusDevice, BusDeviceSync}; + +use crate::configuration::{ + PciBarRegionType, PciBridgeSubclass, PciClassCode, PciConfiguration, PciHeaderType, +}; +use crate::device::{DeviceRelocation, Error as PciDeviceError, PciDevice}; +use crate::PciBarConfiguration; + +const VENDOR_ID_INTEL: u16 = 0x8086; +const DEVICE_ID_INTEL_VIRT_PCIE_HOST: u16 = 0x0d57; +const NUM_DEVICE_IDS: usize = 32; + +/// Errors for device manager. +#[derive(Debug)] +pub enum PciRootError { + /// Could not allocate device address space for the device. + AllocateDeviceAddrs(PciDeviceError), + /// Could not allocate an IRQ number. + AllocateIrq, + /// Could not add a device to the port io bus. + PioInsert(vm_device::BusError), + /// Could not add a device to the mmio bus. + MmioInsert(vm_device::BusError), + /// Could not find an available device slot on the PCI bus. + NoPciDeviceSlotAvailable, + /// Invalid PCI device identifier provided. + InvalidPciDeviceSlot(usize), + /// Valid PCI device identifier but already used. + AlreadyInUsePciDeviceSlot(usize), +} +pub type Result = std::result::Result; + +/// Emulates the PCI Root bridge device. +pub struct PciRoot { + /// Configuration space. + config: PciConfiguration, +} + +impl PciRoot { + /// Create an empty PCI root bridge. + pub fn new(config: Option) -> Self { + if let Some(config) = config { + PciRoot { config } + } else { + PciRoot { + config: PciConfiguration::new( + VENDOR_ID_INTEL, + DEVICE_ID_INTEL_VIRT_PCIE_HOST, + 0, + PciClassCode::BridgeDevice, + &PciBridgeSubclass::HostBridge, + None, + PciHeaderType::Device, + 0, + 0, + None, + None, + ), + } + } + } +} + +impl BusDevice for PciRoot {} + +impl PciDevice for PciRoot { + fn write_config_register( + &mut self, + reg_idx: usize, + offset: u64, + data: &[u8], + ) -> Option> { + self.config.write_config_register(reg_idx, offset, data); + None + } + + fn read_config_register(&mut self, reg_idx: usize) -> u32 { + self.config.read_reg(reg_idx) + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + fn id(&self) -> Option { + None + } +} + +pub struct PciBus { + /// Devices attached to this bus. + /// Device 0 is host bridge. 
+ devices: HashMap>>, + device_reloc: Arc, + device_ids: Vec, +} + +impl PciBus { + pub fn new(pci_root: PciRoot, device_reloc: Arc) -> Self { + let mut devices: HashMap>> = HashMap::new(); + let mut device_ids: Vec = vec![false; NUM_DEVICE_IDS]; + + devices.insert(0, Arc::new(Mutex::new(pci_root))); + device_ids[0] = true; + + PciBus { + devices, + device_reloc, + device_ids, + } + } + + pub fn register_mapping( + &self, + dev: Arc, + io_bus: &Bus, + mmio_bus: &Bus, + bars: Vec, + ) -> Result<()> { + for bar in bars { + match bar.region_type() { + PciBarRegionType::IoRegion => { + io_bus + .insert(dev.clone(), bar.addr(), bar.size()) + .map_err(PciRootError::PioInsert)?; + } + PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => { + mmio_bus + .insert(dev.clone(), bar.addr(), bar.size()) + .map_err(PciRootError::MmioInsert)?; + } + } + } + Ok(()) + } + + pub fn add_device(&mut self, device_id: u32, device: Arc>) -> Result<()> { + self.devices.insert(device_id, device); + Ok(()) + } + + pub fn remove_by_device(&mut self, device: &Arc>) -> Result<()> { + self.devices.retain(|_, dev| !Arc::ptr_eq(dev, device)); + Ok(()) + } + + pub fn next_device_id(&mut self) -> Result { + for (idx, device_id) in self.device_ids.iter_mut().enumerate() { + if !(*device_id) { + *device_id = true; + return Ok(idx as u32); + } + } + + Err(PciRootError::NoPciDeviceSlotAvailable) + } + + pub fn get_device_id(&mut self, id: usize) -> Result<()> { + if id < NUM_DEVICE_IDS { + if !self.device_ids[id] { + self.device_ids[id] = true; + Ok(()) + } else { + Err(PciRootError::AlreadyInUsePciDeviceSlot(id)) + } + } else { + Err(PciRootError::InvalidPciDeviceSlot(id)) + } + } + + pub fn put_device_id(&mut self, id: usize) -> Result<()> { + if id < NUM_DEVICE_IDS { + self.device_ids[id] = false; + Ok(()) + } else { + Err(PciRootError::InvalidPciDeviceSlot(id)) + } + } +} + +pub struct PciConfigIo { + /// Config space register. + config_address: u32, + pci_bus: Arc>, +} + +impl PciConfigIo { + pub fn new(pci_bus: Arc>) -> Self { + PciConfigIo { + config_address: 0, + pci_bus, + } + } + + pub fn config_space_read(&self) -> u32 { + let enabled = (self.config_address & 0x8000_0000) != 0; + if !enabled { + return 0xffff_ffff; + } + + let (bus, device, function, register) = + parse_io_config_address(self.config_address & !0x8000_0000); + + // Only support one bus. + if bus != 0 { + return 0xffff_ffff; + } + + // Don't support multi-function devices. + if function > 0 { + return 0xffff_ffff; + } + + self.pci_bus + .as_ref() + .lock() + .unwrap() + .devices + .get(&(device as u32)) + .map_or(0xffff_ffff, |d| { + d.lock().unwrap().read_config_register(register) + }) + } + + pub fn config_space_write(&mut self, offset: u64, data: &[u8]) -> Option> { + if offset as usize + data.len() > 4 { + return None; + } + + let enabled = (self.config_address & 0x8000_0000) != 0; + if !enabled { + return None; + } + + let (bus, device, _function, register) = + parse_io_config_address(self.config_address & !0x8000_0000); + + // Only support one bus. + if bus != 0 { + return None; + } + + let pci_bus = self.pci_bus.as_ref().lock().unwrap(); + if let Some(d) = pci_bus.devices.get(&(device as u32)) { + let mut device = d.lock().unwrap(); + + // Find out if one of the device's BAR is being reprogrammed, and + // reprogram it if needed. 
+ if let Some(params) = device.detect_bar_reprogramming(register, data) { + if let Err(e) = pci_bus.device_reloc.move_bar( + params.old_base, + params.new_base, + params.len, + device.deref_mut(), + params.region_type, + ) { + error!( + "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x})", + e, params.old_base, params.new_base, params.len + ); + } + } + + // Update the register value + device.write_config_register(register, offset, data) + } else { + None + } + } + + fn set_config_address(&mut self, offset: u64, data: &[u8]) { + if offset as usize + data.len() > 4 { + return; + } + let (mask, value): (u32, u32) = match data.len() { + 1 => ( + 0x0000_00ff << (offset * 8), + u32::from(data[0]) << (offset * 8), + ), + 2 => ( + 0x0000_ffff << (offset * 16), + ((u32::from(data[1]) << 8) | u32::from(data[0])) << (offset * 16), + ), + 4 => (0xffff_ffff, LittleEndian::read_u32(data)), + _ => return, + }; + self.config_address = (self.config_address & !mask) | value; + } +} + +impl BusDevice for PciConfigIo { + fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + // `offset` is relative to 0xcf8 + let value = match offset { + 0..=3 => self.config_address, + 4..=7 => self.config_space_read(), + _ => 0xffff_ffff, + }; + + // Only allow reads to the register boundary. + let start = offset as usize % 4; + let end = start + data.len(); + if end <= 4 { + for i in start..end { + data[i - start] = (value >> (i * 8)) as u8; + } + } else { + for d in data { + *d = 0xff; + } + } + } + + fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { + // `offset` is relative to 0xcf8 + match offset { + o @ 0..=3 => { + self.set_config_address(o, data); + None + } + o @ 4..=7 => self.config_space_write(o - 4, data), + _ => None, + } + } +} + +/// Emulates PCI memory-mapped configuration access mechanism. +pub struct PciConfigMmio { + pci_bus: Arc>, +} + +impl PciConfigMmio { + pub fn new(pci_bus: Arc>) -> Self { + PciConfigMmio { pci_bus } + } + + fn config_space_read(&self, config_address: u32) -> u32 { + let (bus, device, _function, register) = parse_mmio_config_address(config_address); + + // Only support one bus. + if bus != 0 { + return 0xffff_ffff; + } + + self.pci_bus + .lock() + .unwrap() + .devices + .get(&(device as u32)) + .map_or(0xffff_ffff, |d| { + d.lock().unwrap().read_config_register(register) + }) + } + + fn config_space_write(&mut self, config_address: u32, offset: u64, data: &[u8]) { + if offset as usize + data.len() > 4 { + return; + } + + let (bus, device, _function, register) = parse_mmio_config_address(config_address); + + // Only support one bus. + if bus != 0 { + return; + } + + let pci_bus = self.pci_bus.lock().unwrap(); + if let Some(d) = pci_bus.devices.get(&(device as u32)) { + let mut device = d.lock().unwrap(); + + // Find out if one of the device's BAR is being reprogrammed, and + // reprogram it if needed. + if let Some(params) = device.detect_bar_reprogramming(register, data) { + if let Err(e) = pci_bus.device_reloc.move_bar( + params.old_base, + params.new_base, + params.len, + device.deref_mut(), + params.region_type, + ) { + error!( + "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x})", + e, params.old_base, params.new_base, params.len + ); + } + } + + // Update the register value + device.write_config_register(register, offset, data); + } + } +} + +impl BusDevice for PciConfigMmio { + fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + // Only allow reads to the register boundary. 
+ let start = offset as usize % 4; + let end = start + data.len(); + if end > 4 || offset > u64::from(u32::MAX) { + for d in data { + *d = 0xff; + } + return; + } + + let value = self.config_space_read(offset as u32); + for i in start..end { + data[i - start] = (value >> (i * 8)) as u8; + } + } + + fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { + if offset > u64::from(u32::MAX) { + return None; + } + self.config_space_write(offset as u32, offset % 4, data); + + None + } +} + +fn shift_and_mask(value: u32, offset: usize, mask: u32) -> usize { + ((value >> offset) & mask) as usize +} + +// Parse the MMIO address offset to a (bus, device, function, register) tuple. +// See section 7.2.2 PCI Express Enhanced Configuration Access Mechanism (ECAM) +// from the Pci Express Base Specification Revision 5.0 Version 1.0. +fn parse_mmio_config_address(config_address: u32) -> (usize, usize, usize, usize) { + const BUS_NUMBER_OFFSET: usize = 20; + const BUS_NUMBER_MASK: u32 = 0x00ff; + const DEVICE_NUMBER_OFFSET: usize = 15; + const DEVICE_NUMBER_MASK: u32 = 0x1f; + const FUNCTION_NUMBER_OFFSET: usize = 12; + const FUNCTION_NUMBER_MASK: u32 = 0x07; + const REGISTER_NUMBER_OFFSET: usize = 2; + const REGISTER_NUMBER_MASK: u32 = 0x3ff; + + ( + shift_and_mask(config_address, BUS_NUMBER_OFFSET, BUS_NUMBER_MASK), + shift_and_mask(config_address, DEVICE_NUMBER_OFFSET, DEVICE_NUMBER_MASK), + shift_and_mask(config_address, FUNCTION_NUMBER_OFFSET, FUNCTION_NUMBER_MASK), + shift_and_mask(config_address, REGISTER_NUMBER_OFFSET, REGISTER_NUMBER_MASK), + ) +} + +// Parse the CONFIG_ADDRESS register to a (bus, device, function, register) tuple. +fn parse_io_config_address(config_address: u32) -> (usize, usize, usize, usize) { + const BUS_NUMBER_OFFSET: usize = 16; + const BUS_NUMBER_MASK: u32 = 0x00ff; + const DEVICE_NUMBER_OFFSET: usize = 11; + const DEVICE_NUMBER_MASK: u32 = 0x1f; + const FUNCTION_NUMBER_OFFSET: usize = 8; + const FUNCTION_NUMBER_MASK: u32 = 0x07; + const REGISTER_NUMBER_OFFSET: usize = 2; + const REGISTER_NUMBER_MASK: u32 = 0x3f; + + ( + shift_and_mask(config_address, BUS_NUMBER_OFFSET, BUS_NUMBER_MASK), + shift_and_mask(config_address, DEVICE_NUMBER_OFFSET, DEVICE_NUMBER_MASK), + shift_and_mask(config_address, FUNCTION_NUMBER_OFFSET, FUNCTION_NUMBER_MASK), + shift_and_mask(config_address, REGISTER_NUMBER_OFFSET, REGISTER_NUMBER_MASK), + ) +} diff --git a/src/pci/src/configuration.rs b/src/pci/src/configuration.rs new file mode 100644 index 00000000000..3a53167148c --- /dev/null +++ b/src/pci/src/configuration.rs @@ -0,0 +1,1252 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::fmt::{self, Display}; +use std::sync::{Arc, Mutex}; + +use byteorder::{ByteOrder, LittleEndian}; +use serde::{Deserialize, Serialize}; +use vm_device::PciBarType; + +use crate::device::BarReprogrammingParams; +use crate::{MsixConfig, PciInterruptPin}; + +// The number of 32bit registers in the config space, 4096 bytes. 
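For orientation, the parsers at the end of bus.rs above slice a guest configuration access into (bus, device, function, register), and the register index then selects one of the 1024 dword registers defined below. A minimal sketch of the legacy port 0xcf8 decoding, mirroring parse_io_config_address (the helper name and example value are only for illustration):

    /// Decode a legacy CONFIG_ADDRESS (port 0xcf8) value into (bus, device, function, reg_idx).
    fn decode_cf8(config_address: u32) -> (u8, u8, u8, usize) {
        let bus = ((config_address >> 16) & 0xff) as u8;
        let device = ((config_address >> 11) & 0x1f) as u8;
        let function = ((config_address >> 8) & 0x07) as u8;
        let reg_idx = ((config_address >> 2) & 0x3f) as usize;
        (bus, device, function, reg_idx)
    }

    fn main() {
        // 0x8000_0810: enable bit set, bus 0, device 1, function 0, dword 4
        // (byte offset 0x10, i.e. BAR0); the data itself then moves through port 0xcfc.
        assert_eq!(decode_cf8(0x8000_0810), (0, 1, 0, 4));
    }
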
+const NUM_CONFIGURATION_REGISTERS: usize = 1024; + +const STATUS_REG: usize = 1; +const STATUS_REG_CAPABILITIES_USED_MASK: u32 = 0x0010_0000; +const BAR0_REG: usize = 4; +const ROM_BAR_REG: usize = 12; +const ROM_BAR_IDX: usize = 6; +const BAR_IO_ADDR_MASK: u32 = 0xffff_fffc; +const BAR_MEM_ADDR_MASK: u32 = 0xffff_fff0; +const ROM_BAR_ADDR_MASK: u32 = 0xffff_f800; +const MSI_CAPABILITY_REGISTER_MASK: u32 = 0x0071_0000; +const MSIX_CAPABILITY_REGISTER_MASK: u32 = 0xc000_0000; +const NUM_BAR_REGS: usize = 6; +const CAPABILITY_LIST_HEAD_OFFSET: usize = 0x34; +const FIRST_CAPABILITY_OFFSET: usize = 0x40; +const CAPABILITY_MAX_OFFSET: usize = 192; + +const INTERRUPT_LINE_PIN_REG: usize = 15; + +pub const PCI_CONFIGURATION_ID: &str = "pci_configuration"; + +/// Represents the types of PCI headers allowed in the configuration registers. +#[derive(Copy, Clone)] +pub enum PciHeaderType { + Device, + Bridge, +} + +/// Classes of PCI nodes. +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciClassCode { + TooOld, + MassStorage, + NetworkController, + DisplayController, + MultimediaController, + MemoryController, + BridgeDevice, + SimpleCommunicationController, + BaseSystemPeripheral, + InputDevice, + DockingStation, + Processor, + SerialBusController, + WirelessController, + IntelligentIoController, + EncryptionController, + DataAcquisitionSignalProcessing, + Other = 0xff, +} + +impl PciClassCode { + pub fn get_register_value(self) -> u8 { + self as u8 + } +} + +/// A PCI subclass. Each class in `PciClassCode` can specify a unique set of subclasses. This trait +/// is implemented by each subclass. It allows use of a trait object to generate configurations. +pub trait PciSubclass { + /// Convert this subclass to the value used in the PCI specification. + fn get_register_value(&self) -> u8; +} + +/// Subclasses of the MultimediaController class. 
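These class and subclass values end up packed into configuration register 2 together with the programming interface and revision (class in the top byte, then subclass, prog-if, and revision in the low byte), which is how PciConfiguration::new assembles them further down. A small worked example using the audio-controller values exercised by the tests at the end of this file:

    fn main() {
        let class: u32 = 0x04; // MultimediaController
        let subclass: u32 = 0x01; // AudioController
        let prog_if: u32 = 0x5a;
        let revision: u32 = 0x01;

        let reg2 = (class << 24) | (subclass << 16) | (prog_if << 8) | revision;
        assert_eq!(reg2, 0x0401_5a01);
    }
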
+#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciMultimediaSubclass { + VideoController = 0x00, + AudioController = 0x01, + TelephonyDevice = 0x02, + AudioDevice = 0x03, + Other = 0x80, +} + +impl PciSubclass for PciMultimediaSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Subclasses of the BridgeDevice +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciBridgeSubclass { + HostBridge = 0x00, + IsaBridge = 0x01, + EisaBridge = 0x02, + McaBridge = 0x03, + PciToPciBridge = 0x04, + PcmciaBridge = 0x05, + NuBusBridge = 0x06, + CardBusBridge = 0x07, + RacEwayBridge = 0x08, + PciToPciSemiTransparentBridge = 0x09, + InfiniBrandToPciHostBridge = 0x0a, + OtherBridgeDevice = 0x80, +} + +impl PciSubclass for PciBridgeSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Subclass of the SerialBus +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciSerialBusSubClass { + Firewire = 0x00, + Accessbus = 0x01, + Ssa = 0x02, + Usb = 0x03, +} + +impl PciSubclass for PciSerialBusSubClass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Mass Storage Sub Classes +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciMassStorageSubclass { + ScsiStorage = 0x00, + IdeInterface = 0x01, + FloppyController = 0x02, + IpiController = 0x03, + RaidController = 0x04, + AtaController = 0x05, + SataController = 0x06, + SerialScsiController = 0x07, + NvmController = 0x08, + MassStorage = 0x80, +} + +impl PciSubclass for PciMassStorageSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Network Controller Sub Classes +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciNetworkControllerSubclass { + EthernetController = 0x00, + TokenRingController = 0x01, + FddiController = 0x02, + AtmController = 0x03, + IsdnController = 0x04, + WorldFipController = 0x05, + PicmgController = 0x06, + InfinibandController = 0x07, + FabricController = 0x08, + NetworkController = 0x80, +} + +impl PciSubclass for PciNetworkControllerSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Trait to define a PCI class programming interface +/// +/// Each combination of `PciClassCode` and `PciSubclass` can specify a +/// set of register-level programming interfaces. +/// This trait is implemented by each programming interface. +/// It allows use of a trait object to generate configurations. +pub trait PciProgrammingInterface { + /// Convert this programming interface to the value used in the PCI specification. + fn get_register_value(&self) -> u8; +} + +/// Types of PCI capabilities. 
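Each capability is stored in configuration space as a small structure whose first byte is one of the IDs below and whose second byte is the offset of the next capability (0 terminates the list); the head of the chain lives at byte offset 0x34, as add_capability sets up further down. A minimal sketch of walking that chain, where the closure stands in for a register reader such as PciConfiguration::read_reg (the helper name is illustrative only):

    /// Collect (capability_id, offset) pairs by following the capability list.
    fn walk_capabilities(read_reg: impl Fn(usize) -> u32) -> Vec<(u8, u8)> {
        let mut caps = Vec::new();
        // The Capabilities Pointer lives at byte offset 0x34.
        let mut offset = (read_reg(0x34 / 4) & 0xff) as u8;
        while offset != 0 {
            // Capabilities are dword aligned, so the ID and next pointer sit in
            // the low 16 bits of the register holding the capability header.
            let reg = read_reg(offset as usize / 4);
            caps.push(((reg & 0xff) as u8, offset));
            offset = ((reg >> 8) & 0xff) as u8;
        }
        caps
    }
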
+#[derive(PartialEq, Eq, Copy, Clone)] +#[allow(dead_code)] +#[allow(non_camel_case_types)] +#[repr(u8)] +pub enum PciCapabilityId { + ListId = 0, + PowerManagement = 0x01, + AcceleratedGraphicsPort = 0x02, + VitalProductData = 0x03, + SlotIdentification = 0x04, + MessageSignalledInterrupts = 0x05, + CompactPciHotSwap = 0x06, + PciX = 0x07, + HyperTransport = 0x08, + VendorSpecific = 0x09, + Debugport = 0x0A, + CompactPciCentralResourceControl = 0x0B, + PciStandardHotPlugController = 0x0C, + BridgeSubsystemVendorDeviceId = 0x0D, + AgpTargetPciPcibridge = 0x0E, + SecureDevice = 0x0F, + PciExpress = 0x10, + MsiX = 0x11, + SataDataIndexConf = 0x12, + PciAdvancedFeatures = 0x13, + PciEnhancedAllocation = 0x14, +} + +impl From for PciCapabilityId { + fn from(c: u8) -> Self { + match c { + 0 => PciCapabilityId::ListId, + 0x01 => PciCapabilityId::PowerManagement, + 0x02 => PciCapabilityId::AcceleratedGraphicsPort, + 0x03 => PciCapabilityId::VitalProductData, + 0x04 => PciCapabilityId::SlotIdentification, + 0x05 => PciCapabilityId::MessageSignalledInterrupts, + 0x06 => PciCapabilityId::CompactPciHotSwap, + 0x07 => PciCapabilityId::PciX, + 0x08 => PciCapabilityId::HyperTransport, + 0x09 => PciCapabilityId::VendorSpecific, + 0x0A => PciCapabilityId::Debugport, + 0x0B => PciCapabilityId::CompactPciCentralResourceControl, + 0x0C => PciCapabilityId::PciStandardHotPlugController, + 0x0D => PciCapabilityId::BridgeSubsystemVendorDeviceId, + 0x0E => PciCapabilityId::AgpTargetPciPcibridge, + 0x0F => PciCapabilityId::SecureDevice, + 0x10 => PciCapabilityId::PciExpress, + 0x11 => PciCapabilityId::MsiX, + 0x12 => PciCapabilityId::SataDataIndexConf, + 0x13 => PciCapabilityId::PciAdvancedFeatures, + 0x14 => PciCapabilityId::PciEnhancedAllocation, + _ => PciCapabilityId::ListId, + } + } +} + +/// Types of PCI Express capabilities. 
+#[derive(PartialEq, Eq, Copy, Clone, Debug)] +#[allow(dead_code)] +#[repr(u16)] +pub enum PciExpressCapabilityId { + NullCapability = 0x0000, + AdvancedErrorReporting = 0x0001, + VirtualChannelMultiFunctionVirtualChannelNotPresent = 0x0002, + DeviceSerialNumber = 0x0003, + PowerBudgeting = 0x0004, + RootComplexLinkDeclaration = 0x0005, + RootComplexInternalLinkControl = 0x0006, + RootComplexEventCollectorEndpointAssociation = 0x0007, + MultiFunctionVirtualChannel = 0x0008, + VirtualChannelMultiFunctionVirtualChannelPresent = 0x0009, + RootComplexRegisterBlock = 0x000a, + VendorSpecificExtendedCapability = 0x000b, + ConfigurationAccessCorrelation = 0x000c, + AccessControlServices = 0x000d, + AlternativeRoutingIdentificationInterpretation = 0x000e, + AddressTranslationServices = 0x000f, + SingleRootIoVirtualization = 0x0010, + DeprecatedMultiRootIoVirtualization = 0x0011, + Multicast = 0x0012, + PageRequestInterface = 0x0013, + ReservedForAmd = 0x0014, + ResizeableBar = 0x0015, + DynamicPowerAllocation = 0x0016, + ThpRequester = 0x0017, + LatencyToleranceReporting = 0x0018, + SecondaryPciExpress = 0x0019, + ProtocolMultiplexing = 0x001a, + ProcessAddressSpaceId = 0x001b, + LnRequester = 0x001c, + DownstreamPortContainment = 0x001d, + L1PmSubstates = 0x001e, + PrecisionTimeMeasurement = 0x001f, + PciExpressOverMphy = 0x0020, + FRSQueueing = 0x0021, + ReadinessTimeReporting = 0x0022, + DesignatedVendorSpecificExtendedCapability = 0x0023, + VfResizeableBar = 0x0024, + DataLinkFeature = 0x0025, + PhysicalLayerSixteenGts = 0x0026, + LaneMarginingAtTheReceiver = 0x0027, + HierarchyId = 0x0028, + NativePcieEnclosureManagement = 0x0029, + PhysicalLayerThirtyTwoGts = 0x002a, + AlternateProtocol = 0x002b, + SystemFirmwareIntermediary = 0x002c, + ShadowFunctions = 0x002d, + DataObjectExchange = 0x002e, + Reserved = 0x002f, + ExtendedCapabilitiesAbsence = 0xffff, +} + +impl From for PciExpressCapabilityId { + fn from(c: u16) -> Self { + match c { + 0x0000 => PciExpressCapabilityId::NullCapability, + 0x0001 => PciExpressCapabilityId::AdvancedErrorReporting, + 0x0002 => PciExpressCapabilityId::VirtualChannelMultiFunctionVirtualChannelNotPresent, + 0x0003 => PciExpressCapabilityId::DeviceSerialNumber, + 0x0004 => PciExpressCapabilityId::PowerBudgeting, + 0x0005 => PciExpressCapabilityId::RootComplexLinkDeclaration, + 0x0006 => PciExpressCapabilityId::RootComplexInternalLinkControl, + 0x0007 => PciExpressCapabilityId::RootComplexEventCollectorEndpointAssociation, + 0x0008 => PciExpressCapabilityId::MultiFunctionVirtualChannel, + 0x0009 => PciExpressCapabilityId::VirtualChannelMultiFunctionVirtualChannelPresent, + 0x000a => PciExpressCapabilityId::RootComplexRegisterBlock, + 0x000b => PciExpressCapabilityId::VendorSpecificExtendedCapability, + 0x000c => PciExpressCapabilityId::ConfigurationAccessCorrelation, + 0x000d => PciExpressCapabilityId::AccessControlServices, + 0x000e => PciExpressCapabilityId::AlternativeRoutingIdentificationInterpretation, + 0x000f => PciExpressCapabilityId::AddressTranslationServices, + 0x0010 => PciExpressCapabilityId::SingleRootIoVirtualization, + 0x0011 => PciExpressCapabilityId::DeprecatedMultiRootIoVirtualization, + 0x0012 => PciExpressCapabilityId::Multicast, + 0x0013 => PciExpressCapabilityId::PageRequestInterface, + 0x0014 => PciExpressCapabilityId::ReservedForAmd, + 0x0015 => PciExpressCapabilityId::ResizeableBar, + 0x0016 => PciExpressCapabilityId::DynamicPowerAllocation, + 0x0017 => PciExpressCapabilityId::ThpRequester, + 0x0018 => 
PciExpressCapabilityId::LatencyToleranceReporting, + 0x0019 => PciExpressCapabilityId::SecondaryPciExpress, + 0x001a => PciExpressCapabilityId::ProtocolMultiplexing, + 0x001b => PciExpressCapabilityId::ProcessAddressSpaceId, + 0x001c => PciExpressCapabilityId::LnRequester, + 0x001d => PciExpressCapabilityId::DownstreamPortContainment, + 0x001e => PciExpressCapabilityId::L1PmSubstates, + 0x001f => PciExpressCapabilityId::PrecisionTimeMeasurement, + 0x0020 => PciExpressCapabilityId::PciExpressOverMphy, + 0x0021 => PciExpressCapabilityId::FRSQueueing, + 0x0022 => PciExpressCapabilityId::ReadinessTimeReporting, + 0x0023 => PciExpressCapabilityId::DesignatedVendorSpecificExtendedCapability, + 0x0024 => PciExpressCapabilityId::VfResizeableBar, + 0x0025 => PciExpressCapabilityId::DataLinkFeature, + 0x0026 => PciExpressCapabilityId::PhysicalLayerSixteenGts, + 0x0027 => PciExpressCapabilityId::LaneMarginingAtTheReceiver, + 0x0028 => PciExpressCapabilityId::HierarchyId, + 0x0029 => PciExpressCapabilityId::NativePcieEnclosureManagement, + 0x002a => PciExpressCapabilityId::PhysicalLayerThirtyTwoGts, + 0x002b => PciExpressCapabilityId::AlternateProtocol, + 0x002c => PciExpressCapabilityId::SystemFirmwareIntermediary, + 0x002d => PciExpressCapabilityId::ShadowFunctions, + 0x002e => PciExpressCapabilityId::DataObjectExchange, + 0xffff => PciExpressCapabilityId::ExtendedCapabilitiesAbsence, + _ => PciExpressCapabilityId::Reserved, + } + } +} + +/// A PCI capability list. Devices can optionally specify capabilities in their configuration space. +pub trait PciCapability { + fn bytes(&self) -> &[u8]; + fn id(&self) -> PciCapabilityId; +} + +fn encode_32_bits_bar_size(bar_size: u32) -> Option { + if bar_size > 0 { + return Some(!(bar_size - 1)); + } + None +} + +fn decode_32_bits_bar_size(bar_size: u32) -> Option { + if bar_size > 0 { + return Some(!bar_size + 1); + } + None +} + +fn encode_64_bits_bar_size(bar_size: u64) -> Option<(u32, u32)> { + if bar_size > 0 { + let result = !(bar_size - 1); + let result_hi = (result >> 32) as u32; + let result_lo = (result & 0xffff_ffff) as u32; + return Some((result_hi, result_lo)); + } + None +} + +fn decode_64_bits_bar_size(bar_size_hi: u32, bar_size_lo: u32) -> Option { + let bar_size: u64 = ((bar_size_hi as u64) << 32) | (bar_size_lo as u64); + if bar_size > 0 { + return Some(!bar_size + 1); + } + None +} + +#[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)] +struct PciBar { + addr: u32, + size: u32, + used: bool, + r#type: Option, +} + +#[derive(Serialize, Deserialize)] +pub struct PciConfigurationState { + registers: Vec, + writable_bits: Vec, + bars: Vec, + rom_bar_addr: u32, + rom_bar_size: u32, + rom_bar_used: bool, + last_capability: Option<(usize, usize)>, + msix_cap_reg_idx: Option, +} + +/// Contains the configuration space of a PCI node. +/// +/// See the [specification](https://en.wikipedia.org/wiki/PCI_configuration_space). +/// The configuration space is accessed with DWORD reads and writes from the guest. +pub struct PciConfiguration { + registers: [u32; NUM_CONFIGURATION_REGISTERS], + writable_bits: [u32; NUM_CONFIGURATION_REGISTERS], // writable bits for each register. + bars: [PciBar; NUM_BAR_REGS], + rom_bar_addr: u32, + rom_bar_size: u32, + rom_bar_used: bool, + // Contains the byte offset and size of the last capability. 
+ last_capability: Option<(usize, usize)>, + msix_cap_reg_idx: Option, + msix_config: Option>>, +} + +/// See pci_regs.h in kernel +#[derive(Copy, Clone, PartialEq, Eq, Serialize, Deserialize, Debug)] +pub enum PciBarRegionType { + Memory32BitRegion = 0, + IoRegion = 0x01, + Memory64BitRegion = 0x04, +} + +impl From for PciBarRegionType { + fn from(type_: PciBarType) -> Self { + match type_ { + PciBarType::Io => PciBarRegionType::IoRegion, + PciBarType::Mmio32 => PciBarRegionType::Memory32BitRegion, + PciBarType::Mmio64 => PciBarRegionType::Memory64BitRegion, + } + } +} + +impl From for PciBarType { + fn from(val: PciBarRegionType) -> Self { + match val { + PciBarRegionType::IoRegion => PciBarType::Io, + PciBarRegionType::Memory32BitRegion => PciBarType::Mmio32, + PciBarRegionType::Memory64BitRegion => PciBarType::Mmio64, + } + } +} + +#[derive(Copy, Clone)] +pub enum PciBarPrefetchable { + NotPrefetchable = 0, + Prefetchable = 0x08, +} + +impl From for bool { + fn from(val: PciBarPrefetchable) -> Self { + match val { + PciBarPrefetchable::NotPrefetchable => false, + PciBarPrefetchable::Prefetchable => true, + } + } +} + +#[derive(Copy, Clone)] +pub struct PciBarConfiguration { + addr: u64, + size: u64, + idx: usize, + region_type: PciBarRegionType, + prefetchable: PciBarPrefetchable, +} + +#[derive(Debug)] +pub enum Error { + BarAddressInvalid(u64, u64), + BarInUse(usize), + BarInUse64(usize), + BarInvalid(usize), + BarInvalid64(usize), + BarSizeInvalid(u64), + CapabilityEmpty, + CapabilityLengthInvalid(usize), + CapabilitySpaceFull(usize), + Decode32BarSize, + Decode64BarSize, + Encode32BarSize, + Encode64BarSize, + RomBarAddressInvalid(u64, u64), + RomBarInUse(usize), + RomBarInvalid(usize), + RomBarSizeInvalid(u64), +} +pub type Result = std::result::Result; + +impl std::error::Error for Error {} + +impl Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use self::Error::*; + match self { + BarAddressInvalid(a, s) => write!(f, "address {a} size {s} too big"), + BarInUse(b) => write!(f, "bar {b} already used"), + BarInUse64(b) => write!(f, "64bit bar {b} already used(requires two regs)"), + BarInvalid(b) => write!(f, "bar {} invalid, max {}", b, NUM_BAR_REGS - 1), + BarInvalid64(b) => write!( + f, + "64bitbar {} invalid, requires two regs, max {}", + b, + NUM_BAR_REGS - 1 + ), + BarSizeInvalid(s) => write!(f, "bar address {s} not a power of two"), + CapabilityEmpty => write!(f, "empty capabilities are invalid"), + CapabilityLengthInvalid(l) => write!(f, "Invalid capability length {l}"), + CapabilitySpaceFull(s) => write!(f, "capability of size {s} doesn't fit"), + Decode32BarSize => write!(f, "failed to decode 32 bits BAR size"), + Decode64BarSize => write!(f, "failed to decode 64 bits BAR size"), + Encode32BarSize => write!(f, "failed to encode 32 bits BAR size"), + Encode64BarSize => write!(f, "failed to encode 64 bits BAR size"), + RomBarAddressInvalid(a, s) => write!(f, "address {a} size {s} too big"), + RomBarInUse(b) => write!(f, "rom bar {b} already used"), + RomBarInvalid(b) => write!(f, "rom bar {} invalid, max {}", b, NUM_BAR_REGS - 1), + RomBarSizeInvalid(s) => write!(f, "rom bar address {s} not a power of two"), + } + } +} + +impl PciConfiguration { + #[allow(clippy::too_many_arguments)] + pub fn new( + vendor_id: u16, + device_id: u16, + revision_id: u8, + class_code: PciClassCode, + subclass: &dyn PciSubclass, + programming_interface: Option<&dyn PciProgrammingInterface>, + header_type: PciHeaderType, + subsystem_vendor_id: u16, + subsystem_id: 
u16, + msix_config: Option>>, + state: Option, + ) -> Self { + let ( + registers, + writable_bits, + bars, + rom_bar_addr, + rom_bar_size, + rom_bar_used, + last_capability, + msix_cap_reg_idx, + ) = if let Some(state) = state { + ( + state.registers.try_into().unwrap(), + state.writable_bits.try_into().unwrap(), + state.bars.try_into().unwrap(), + state.rom_bar_addr, + state.rom_bar_size, + state.rom_bar_used, + state.last_capability, + state.msix_cap_reg_idx, + ) + } else { + let mut registers = [0u32; NUM_CONFIGURATION_REGISTERS]; + let mut writable_bits = [0u32; NUM_CONFIGURATION_REGISTERS]; + registers[0] = (u32::from(device_id) << 16) | u32::from(vendor_id); + // TODO(dverkamp): Status should be write-1-to-clear + writable_bits[1] = 0x0000_ffff; // Status (r/o), command (r/w) + let pi = if let Some(pi) = programming_interface { + pi.get_register_value() + } else { + 0 + }; + registers[2] = (u32::from(class_code.get_register_value()) << 24) + | (u32::from(subclass.get_register_value()) << 16) + | (u32::from(pi) << 8) + | u32::from(revision_id); + writable_bits[3] = 0x0000_00ff; // Cacheline size (r/w) + match header_type { + PciHeaderType::Device => { + registers[3] = 0x0000_0000; // Header type 0 (device) + writable_bits[15] = 0x0000_00ff; // Interrupt line (r/w) + } + PciHeaderType::Bridge => { + registers[3] = 0x0001_0000; // Header type 1 (bridge) + writable_bits[9] = 0xfff0_fff0; // Memory base and limit + writable_bits[15] = 0xffff_00ff; // Bridge control (r/w), interrupt line (r/w) + } + }; + registers[11] = (u32::from(subsystem_id) << 16) | u32::from(subsystem_vendor_id); + + ( + registers, + writable_bits, + [PciBar::default(); NUM_BAR_REGS], + 0, + 0, + false, + None, + None, + ) + }; + + PciConfiguration { + registers, + writable_bits, + bars, + rom_bar_addr, + rom_bar_size, + rom_bar_used, + last_capability, + msix_cap_reg_idx, + msix_config, + } + } + + pub fn state(&self) -> PciConfigurationState { + PciConfigurationState { + registers: self.registers.to_vec(), + writable_bits: self.writable_bits.to_vec(), + bars: self.bars.to_vec(), + rom_bar_addr: self.rom_bar_addr, + rom_bar_size: self.rom_bar_size, + rom_bar_used: self.rom_bar_used, + last_capability: self.last_capability, + msix_cap_reg_idx: self.msix_cap_reg_idx, + } + } + + /// Reads a 32bit register from `reg_idx` in the register map. + pub fn read_reg(&self, reg_idx: usize) -> u32 { + *(self.registers.get(reg_idx).unwrap_or(&0xffff_ffff)) + } + + /// Writes a 32bit register to `reg_idx` in the register map. + pub fn write_reg(&mut self, reg_idx: usize, value: u32) { + let mut mask = self.writable_bits[reg_idx]; + + if (BAR0_REG..BAR0_REG + NUM_BAR_REGS).contains(®_idx) { + // Handle very specific case where the BAR is being written with + // all 1's to retrieve the BAR size during next BAR reading. + if value == 0xffff_ffff { + mask &= self.bars[reg_idx - 4].size; + } + } else if reg_idx == ROM_BAR_REG { + // Handle very specific case where the BAR is being written with + // all 1's on bits 31-11 to retrieve the BAR size during next BAR + // reading. + if value & ROM_BAR_ADDR_MASK == ROM_BAR_ADDR_MASK { + mask &= self.rom_bar_size; + } + } + + if let Some(r) = self.registers.get_mut(reg_idx) { + *r = (*r & !self.writable_bits[reg_idx]) | (value & mask); + } else { + warn!("bad PCI register write {}", reg_idx); + } + } + + /// Writes a 16bit word to `offset`. `offset` must be 16bit aligned. 
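The all-ones special case in write_reg above is the standard BAR sizing handshake: the guest writes 0xffff_ffff to a BAR, reads it back, masks off the low type bits, and derives the size from the bits that stayed clear. A worked example of the round trip performed by encode_32_bits_bar_size and the guest-side decode:

    fn main() {
        let bar_size: u32 = 0x1000; // a 4 KiB memory BAR

        // What the device keeps so that a read after an all-ones write reports the size.
        let encoded = !(bar_size - 1);
        assert_eq!(encoded, 0xffff_f000);

        // What the guest computes after reading back and masking the low
        // memory-type bits (BAR_MEM_ADDR_MASK).
        let size = !(encoded & 0xffff_fff0) + 1;
        assert_eq!(size, 0x1000);
    }
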
+ pub fn write_word(&mut self, offset: usize, value: u16) { + let shift = match offset % 4 { + 0 => 0, + 2 => 16, + _ => { + warn!("bad PCI config write offset {}", offset); + return; + } + }; + let reg_idx = offset / 4; + + if let Some(r) = self.registers.get_mut(reg_idx) { + let writable_mask = self.writable_bits[reg_idx]; + let mask = (0xffffu32 << shift) & writable_mask; + let shifted_value = (u32::from(value) << shift) & writable_mask; + *r = *r & !mask | shifted_value; + } else { + warn!("bad PCI config write offset {}", offset); + } + } + + /// Writes a byte to `offset`. + pub fn write_byte(&mut self, offset: usize, value: u8) { + self.write_byte_internal(offset, value, true); + } + + /// Writes a byte to `offset`, optionally enforcing read-only bits. + fn write_byte_internal(&mut self, offset: usize, value: u8, apply_writable_mask: bool) { + let shift = (offset % 4) * 8; + let reg_idx = offset / 4; + + if let Some(r) = self.registers.get_mut(reg_idx) { + let writable_mask = if apply_writable_mask { + self.writable_bits[reg_idx] + } else { + 0xffff_ffff + }; + let mask = (0xffu32 << shift) & writable_mask; + let shifted_value = (u32::from(value) << shift) & writable_mask; + *r = *r & !mask | shifted_value; + } else { + warn!("bad PCI config write offset {}", offset); + } + } + + /// Adds a region specified by `config`. Configures the specified BAR(s) to + /// report this region and size to the guest kernel. Enforces a few constraints + /// (i.e, region size must be power of two, register not already used). + pub fn add_pci_bar(&mut self, config: &PciBarConfiguration) -> Result<()> { + let bar_idx = config.idx; + let reg_idx = BAR0_REG + bar_idx; + + if self.bars[bar_idx].used { + return Err(Error::BarInUse(bar_idx)); + } + + if !config.size.is_power_of_two() { + return Err(Error::BarSizeInvalid(config.size)); + } + + if bar_idx >= NUM_BAR_REGS { + return Err(Error::BarInvalid(bar_idx)); + } + + let end_addr = config + .addr + .checked_add(config.size - 1) + .ok_or(Error::BarAddressInvalid(config.addr, config.size))?; + match config.region_type { + PciBarRegionType::Memory32BitRegion | PciBarRegionType::IoRegion => { + if end_addr > u64::from(u32::MAX) { + return Err(Error::BarAddressInvalid(config.addr, config.size)); + } + + // Encode the BAR size as expected by the software running in + // the guest. + self.bars[bar_idx].size = + encode_32_bits_bar_size(config.size as u32).ok_or(Error::Encode32BarSize)?; + } + PciBarRegionType::Memory64BitRegion => { + if bar_idx + 1 >= NUM_BAR_REGS { + return Err(Error::BarInvalid64(bar_idx)); + } + + if self.bars[bar_idx + 1].used { + return Err(Error::BarInUse64(bar_idx)); + } + + // Encode the BAR size as expected by the software running in + // the guest. 
+ let (bar_size_hi, bar_size_lo) = + encode_64_bits_bar_size(config.size).ok_or(Error::Encode64BarSize)?; + + self.registers[reg_idx + 1] = (config.addr >> 32) as u32; + self.writable_bits[reg_idx + 1] = 0xffff_ffff; + self.bars[bar_idx + 1].addr = self.registers[reg_idx + 1]; + self.bars[bar_idx].size = bar_size_lo; + self.bars[bar_idx + 1].size = bar_size_hi; + self.bars[bar_idx + 1].used = true; + } + } + + let (mask, lower_bits) = match config.region_type { + PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => ( + BAR_MEM_ADDR_MASK, + config.prefetchable as u32 | config.region_type as u32, + ), + PciBarRegionType::IoRegion => (BAR_IO_ADDR_MASK, config.region_type as u32), + }; + + self.registers[reg_idx] = ((config.addr as u32) & mask) | lower_bits; + self.writable_bits[reg_idx] = mask; + self.bars[bar_idx].addr = self.registers[reg_idx]; + self.bars[bar_idx].used = true; + self.bars[bar_idx].r#type = Some(config.region_type); + + Ok(()) + } + + /// Adds rom expansion BAR. + pub fn add_pci_rom_bar(&mut self, config: &PciBarConfiguration, active: u32) -> Result<()> { + let bar_idx = config.idx; + let reg_idx = ROM_BAR_REG; + + if self.rom_bar_used { + return Err(Error::RomBarInUse(bar_idx)); + } + + if !config.size.is_power_of_two() { + return Err(Error::RomBarSizeInvalid(config.size)); + } + + if bar_idx != ROM_BAR_IDX { + return Err(Error::RomBarInvalid(bar_idx)); + } + + let end_addr = config + .addr + .checked_add(config.size - 1) + .ok_or(Error::RomBarAddressInvalid(config.addr, config.size))?; + + if end_addr > u64::from(u32::MAX) { + return Err(Error::RomBarAddressInvalid(config.addr, config.size)); + } + + self.registers[reg_idx] = (config.addr as u32) | active; + self.writable_bits[reg_idx] = ROM_BAR_ADDR_MASK; + self.rom_bar_addr = self.registers[reg_idx]; + self.rom_bar_size = + encode_32_bits_bar_size(config.size as u32).ok_or(Error::Encode32BarSize)?; + self.rom_bar_used = true; + + Ok(()) + } + + /// Returns the address of the given BAR region. + pub fn get_bar_addr(&self, bar_num: usize) -> u64 { + let bar_idx = BAR0_REG + bar_num; + + let mut addr = u64::from(self.bars[bar_num].addr & self.writable_bits[bar_idx]); + + if let Some(bar_type) = self.bars[bar_num].r#type { + if bar_type == PciBarRegionType::Memory64BitRegion { + addr |= u64::from(self.bars[bar_num + 1].addr) << 32; + } + } + + addr + } + + /// Configures the IRQ line and pin used by this device. + pub fn set_irq(&mut self, line: u8, pin: PciInterruptPin) { + // `pin` is 1-based in the pci config space. + let pin_idx = (pin as u32) + 1; + self.registers[INTERRUPT_LINE_PIN_REG] = (self.registers[INTERRUPT_LINE_PIN_REG] + & 0xffff_0000) + | (pin_idx << 8) + | u32::from(line); + } + + /// Adds the capability `cap_data` to the list of capabilities. + /// `cap_data` should include the two-byte PCI capability header (type, next), + /// but not populate it. Correct values will be generated automatically based + /// on `cap_data.id()`. + pub fn add_capability(&mut self, cap_data: &dyn PciCapability) -> Result { + let total_len = cap_data.bytes().len(); + // Check that the length is valid. 
+ if cap_data.bytes().is_empty() { + return Err(Error::CapabilityEmpty); + } + let (cap_offset, tail_offset) = match self.last_capability { + Some((offset, len)) => (Self::next_dword(offset, len), offset + 1), + None => (FIRST_CAPABILITY_OFFSET, CAPABILITY_LIST_HEAD_OFFSET), + }; + let end_offset = cap_offset + .checked_add(total_len) + .ok_or(Error::CapabilitySpaceFull(total_len))?; + if end_offset > CAPABILITY_MAX_OFFSET { + return Err(Error::CapabilitySpaceFull(total_len)); + } + self.registers[STATUS_REG] |= STATUS_REG_CAPABILITIES_USED_MASK; + self.write_byte_internal(tail_offset, cap_offset as u8, false); + self.write_byte_internal(cap_offset, cap_data.id() as u8, false); + self.write_byte_internal(cap_offset + 1, 0, false); // Next pointer. + for (i, byte) in cap_data.bytes().iter().enumerate() { + self.write_byte_internal(cap_offset + i + 2, *byte, false); + } + self.last_capability = Some((cap_offset, total_len)); + + match cap_data.id() { + PciCapabilityId::MessageSignalledInterrupts => { + self.writable_bits[cap_offset / 4] = MSI_CAPABILITY_REGISTER_MASK; + } + PciCapabilityId::MsiX => { + self.msix_cap_reg_idx = Some(cap_offset / 4); + self.writable_bits[self.msix_cap_reg_idx.unwrap()] = MSIX_CAPABILITY_REGISTER_MASK; + } + _ => {} + } + + Ok(cap_offset) + } + + // Find the next aligned offset after the one given. + fn next_dword(offset: usize, len: usize) -> usize { + let next = offset + len; + (next + 3) & !3 + } + + pub fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) { + if offset as usize + data.len() > 4 { + return; + } + + // Handle potential write to MSI-X message control register + if let Some(msix_cap_reg_idx) = self.msix_cap_reg_idx { + if let Some(msix_config) = &self.msix_config { + if msix_cap_reg_idx == reg_idx && offset == 2 && data.len() == 2 { + msix_config + .lock() + .unwrap() + .set_msg_ctl(LittleEndian::read_u16(data)); + } else if msix_cap_reg_idx == reg_idx && offset == 0 && data.len() == 4 { + msix_config + .lock() + .unwrap() + .set_msg_ctl((LittleEndian::read_u32(data) >> 16) as u16); + } + } + } + + match data.len() { + 1 => self.write_byte(reg_idx * 4 + offset as usize, data[0]), + 2 => self.write_word( + reg_idx * 4 + offset as usize, + u16::from(data[0]) | (u16::from(data[1]) << 8), + ), + 4 => self.write_reg(reg_idx, LittleEndian::read_u32(data)), + _ => (), + } + } + + pub fn read_config_register(&self, reg_idx: usize) -> u32 { + self.read_reg(reg_idx) + } + + pub fn detect_bar_reprogramming( + &mut self, + reg_idx: usize, + data: &[u8], + ) -> Option { + if data.len() != 4 { + return None; + } + + let value = LittleEndian::read_u32(data); + + let mask = self.writable_bits[reg_idx]; + if (BAR0_REG..BAR0_REG + NUM_BAR_REGS).contains(®_idx) { + // Ignore the case where the BAR size is being asked for. + if value == 0xffff_ffff { + return None; + } + + let bar_idx = reg_idx - 4; + // Handle special case where the address being written is + // different from the address initially provided. This is a + // BAR reprogramming case which needs to be properly caught. + if let Some(bar_type) = self.bars[bar_idx].r#type { + // In case of 64 bits memory BAR, we don't do anything until + // the upper BAR is modified, otherwise we would be moving the + // BAR to a wrong location in memory. + if bar_type == PciBarRegionType::Memory64BitRegion { + return None; + } + + // Ignore the case where the value is unchanged. 
+ if (value & mask) == (self.bars[bar_idx].addr & mask) { + return None; + } + + info!( + "Detected BAR reprogramming: (BAR {}) 0x{:x}->0x{:x}", + reg_idx, self.registers[reg_idx], value + ); + let old_base = u64::from(self.bars[bar_idx].addr & mask); + let new_base = u64::from(value & mask); + let len = u64::from( + decode_32_bits_bar_size(self.bars[bar_idx].size) + .ok_or(Error::Decode32BarSize) + .unwrap(), + ); + let region_type = bar_type; + + self.bars[bar_idx].addr = value; + + return Some(BarReprogrammingParams { + old_base, + new_base, + len, + region_type, + }); + } else if (reg_idx > BAR0_REG) + && ((self.registers[reg_idx - 1] & self.writable_bits[reg_idx - 1]) + != (self.bars[bar_idx - 1].addr & self.writable_bits[reg_idx - 1]) + || (value & mask) != (self.bars[bar_idx].addr & mask)) + { + info!( + "Detected BAR reprogramming: (BAR {}) 0x{:x}->0x{:x}", + reg_idx, self.registers[reg_idx], value + ); + let old_base = (u64::from(self.bars[bar_idx].addr & mask) << 32) + | u64::from(self.bars[bar_idx - 1].addr & self.writable_bits[reg_idx - 1]); + let new_base = (u64::from(value & mask) << 32) + | u64::from(self.registers[reg_idx - 1] & self.writable_bits[reg_idx - 1]); + let len = + decode_64_bits_bar_size(self.bars[bar_idx].size, self.bars[bar_idx - 1].size) + .ok_or(Error::Decode64BarSize) + .unwrap(); + let region_type = PciBarRegionType::Memory64BitRegion; + + self.bars[bar_idx].addr = value; + self.bars[bar_idx - 1].addr = self.registers[reg_idx - 1]; + + return Some(BarReprogrammingParams { + old_base, + new_base, + len, + region_type, + }); + } + } else if reg_idx == ROM_BAR_REG && (value & mask) != (self.rom_bar_addr & mask) { + // Ignore the case where the BAR size is being asked for. + if value & ROM_BAR_ADDR_MASK == ROM_BAR_ADDR_MASK { + return None; + } + + info!( + "Detected ROM BAR reprogramming: (BAR {}) 0x{:x}->0x{:x}", + reg_idx, self.registers[reg_idx], value + ); + let old_base = u64::from(self.rom_bar_addr & mask); + let new_base = u64::from(value & mask); + let len = u64::from( + decode_32_bits_bar_size(self.rom_bar_size) + .ok_or(Error::Decode32BarSize) + .unwrap(), + ); + let region_type = PciBarRegionType::Memory32BitRegion; + + self.rom_bar_addr = value; + + return Some(BarReprogrammingParams { + old_base, + new_base, + len, + region_type, + }); + } + + None + } +} + +impl Default for PciBarConfiguration { + fn default() -> Self { + PciBarConfiguration { + idx: 0, + addr: 0, + size: 0, + region_type: PciBarRegionType::Memory64BitRegion, + prefetchable: PciBarPrefetchable::NotPrefetchable, + } + } +} + +impl PciBarConfiguration { + pub fn new( + idx: usize, + size: u64, + region_type: PciBarRegionType, + prefetchable: PciBarPrefetchable, + ) -> Self { + PciBarConfiguration { + idx, + addr: 0, + size, + region_type, + prefetchable, + } + } + + #[must_use] + pub fn set_index(mut self, idx: usize) -> Self { + self.idx = idx; + self + } + + #[must_use] + pub fn set_address(mut self, addr: u64) -> Self { + self.addr = addr; + self + } + + #[must_use] + pub fn set_size(mut self, size: u64) -> Self { + self.size = size; + self + } + + #[must_use] + pub fn set_region_type(mut self, region_type: PciBarRegionType) -> Self { + self.region_type = region_type; + self + } + + #[must_use] + pub fn set_prefetchable(mut self, prefetchable: PciBarPrefetchable) -> Self { + self.prefetchable = prefetchable; + self + } + + pub fn idx(&self) -> usize { + self.idx + } + + pub fn addr(&self) -> u64 { + self.addr + } + + pub fn size(&self) -> u64 { + self.size + } + + pub fn 
region_type(&self) -> PciBarRegionType { + self.region_type + } + + pub fn prefetchable(&self) -> PciBarPrefetchable { + self.prefetchable + } +} + +#[cfg(test)] +mod tests { + use vm_memory::ByteValued; + + use super::*; + + #[repr(C, packed)] + #[derive(Clone, Copy, Default)] + #[allow(dead_code)] + struct TestCap { + len: u8, + foo: u8, + } + + // SAFETY: All members are simple numbers and any value is valid. + unsafe impl ByteValued for TestCap {} + + impl PciCapability for TestCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } + } + + #[test] + fn add_capability() { + let mut cfg = PciConfiguration::new( + 0x1234, + 0x5678, + 0x1, + PciClassCode::MultimediaController, + &PciMultimediaSubclass::AudioController, + None, + PciHeaderType::Device, + 0xABCD, + 0x2468, + None, + None, + ); + + // Add two capabilities with different contents. + let cap1 = TestCap { len: 4, foo: 0xAA }; + let cap1_offset = cfg.add_capability(&cap1).unwrap(); + assert_eq!(cap1_offset % 4, 0); + + let cap2 = TestCap { + len: 0x04, + foo: 0x55, + }; + let cap2_offset = cfg.add_capability(&cap2).unwrap(); + assert_eq!(cap2_offset % 4, 0); + + // The capability list head should be pointing to cap1. + let cap_ptr = cfg.read_reg(CAPABILITY_LIST_HEAD_OFFSET / 4) & 0xFF; + assert_eq!(cap1_offset, cap_ptr as usize); + + // Verify the contents of the capabilities. + let cap1_data = cfg.read_reg(cap1_offset / 4); + assert_eq!(cap1_data & 0xFF, 0x09); // capability ID + assert_eq!((cap1_data >> 8) & 0xFF, cap2_offset as u32); // next capability pointer + assert_eq!((cap1_data >> 16) & 0xFF, 0x04); // cap1.len + assert_eq!((cap1_data >> 24) & 0xFF, 0xAA); // cap1.foo + + let cap2_data = cfg.read_reg(cap2_offset / 4); + assert_eq!(cap2_data & 0xFF, 0x09); // capability ID + assert_eq!((cap2_data >> 8) & 0xFF, 0x00); // next capability pointer + assert_eq!((cap2_data >> 16) & 0xFF, 0x04); // cap2.len + assert_eq!((cap2_data >> 24) & 0xFF, 0x55); // cap2.foo + } + + #[derive(Copy, Clone)] + enum TestPi { + Test = 0x5a, + } + + impl PciProgrammingInterface for TestPi { + fn get_register_value(&self) -> u8 { + *self as u8 + } + } + + #[test] + fn class_code() { + let cfg = PciConfiguration::new( + 0x1234, + 0x5678, + 0x1, + PciClassCode::MultimediaController, + &PciMultimediaSubclass::AudioController, + Some(&TestPi::Test), + PciHeaderType::Device, + 0xABCD, + 0x2468, + None, + None, + ); + + let class_reg = cfg.read_reg(2); + let class_code = (class_reg >> 24) & 0xFF; + let subclass = (class_reg >> 16) & 0xFF; + let prog_if = (class_reg >> 8) & 0xFF; + assert_eq!(class_code, 0x04); + assert_eq!(subclass, 0x01); + assert_eq!(prog_if, 0x5a); + } +} diff --git a/src/pci/src/device.rs b/src/pci/src/device.rs new file mode 100644 index 00000000000..d3bd3056a36 --- /dev/null +++ b/src/pci/src/device.rs @@ -0,0 +1,136 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. 
+// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::any::Any; +use std::fmt::{self, Display}; +use std::sync::{Arc, Barrier}; +use std::{io, result}; + +use vm_allocator::AddressAllocator; +use vm_device::Resource; + +use crate::configuration::{self, PciBarRegionType}; +use crate::PciBarConfiguration; + +#[derive(Debug)] +pub enum Error { + /// Setup of the device capabilities failed. + CapabilitiesSetup(configuration::Error), + /// Allocating space for an IO BAR failed. + IoAllocationFailed(u64), + /// Registering an IO BAR failed. + IoRegistrationFailed(u64, configuration::Error), + /// Expected resource not found. + MissingResource, + /// Invalid resource. + InvalidResource(Resource), +} +pub type Result = std::result::Result; + +impl Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use self::Error::*; + + match self { + CapabilitiesSetup(e) => write!(f, "failed to add capability {e}"), + IoAllocationFailed(size) => { + write!(f, "failed to allocate space for an IO BAR, size={size}") + } + IoRegistrationFailed(addr, e) => { + write!(f, "failed to register an IO BAR, addr={addr} err={e}") + } + MissingResource => write!(f, "failed to find expected resource"), + InvalidResource(r) => write!(f, "invalid resource {r:?}"), + } + } +} + +#[derive(Clone, Copy)] +pub struct BarReprogrammingParams { + pub old_base: u64, + pub new_base: u64, + pub len: u64, + pub region_type: PciBarRegionType, +} + +pub trait PciDevice: Send { + /// Allocates the needed PCI BARs space using the `allocate` function which takes a size and + /// returns an address. Returns a Vec of (GuestAddress, GuestUsize) tuples. + fn allocate_bars( + &mut self, + _mmio32_allocator: &mut AddressAllocator, + _mmio64_allocator: &mut AddressAllocator, + _resources: Option>, + ) -> Result> { + Ok(Vec::new()) + } + + /// Frees the PCI BARs previously allocated with a call to allocate_bars(). + fn free_bars( + &mut self, + _mmio32_allocator: &mut AddressAllocator, + _mmio64_allocator: &mut AddressAllocator, + ) -> Result<()> { + Ok(()) + } + + /// Sets a register in the configuration space. + /// * `reg_idx` - The index of the config register to modify. + /// * `offset` - Offset into the register. + fn write_config_register( + &mut self, + reg_idx: usize, + offset: u64, + data: &[u8], + ) -> Option>; + /// Gets a register from the configuration space. + /// * `reg_idx` - The index of the config register to read. + fn read_config_register(&mut self, reg_idx: usize) -> u32; + /// Detects if a BAR is being reprogrammed. + fn detect_bar_reprogramming( + &mut self, + _reg_idx: usize, + _data: &[u8], + ) -> Option { + None + } + /// Reads from a BAR region mapped into the device. + /// * `addr` - The guest address inside the BAR. + /// * `data` - Filled with the data from `addr`. + fn read_bar(&mut self, _base: u64, _offset: u64, _data: &mut [u8]) {} + /// Writes to a BAR region mapped into the device. + /// * `addr` - The guest address inside the BAR. + /// * `data` - The data to write. + fn write_bar(&mut self, _base: u64, _offset: u64, _data: &[u8]) -> Option> { + None + } + /// Relocates the BAR to a different address in guest address space. + fn move_bar(&mut self, _old_base: u64, _new_base: u64) -> result::Result<(), io::Error> { + Ok(()) + } + /// Provides a mutable reference to the Any trait. This is useful to let + /// the caller have access to the underlying type behind the trait. + fn as_any_mut(&mut self) -> &mut dyn Any; + + /// Optionally returns a unique identifier. 
+ fn id(&self) -> Option; +} + +/// This trait defines a set of functions which can be triggered whenever a +/// PCI device is modified in any way. +pub trait DeviceRelocation: Send + Sync { + /// The BAR needs to be moved to a different location in the guest address + /// space. This follows a decision from the software running in the guest. + fn move_bar( + &self, + old_base: u64, + new_base: u64, + len: u64, + pci_dev: &mut dyn PciDevice, + region_type: PciBarRegionType, + ) -> result::Result<(), io::Error>; +} diff --git a/src/pci/src/lib.rs b/src/pci/src/lib.rs new file mode 100644 index 00000000000..2672159e474 --- /dev/null +++ b/src/pci/src/lib.rs @@ -0,0 +1,198 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +//! Implements pci devices and busses. +#[macro_use] +extern crate log; + +mod bus; +mod configuration; +mod device; +mod msi; +mod msix; + +use std::fmt::{self, Debug, Display}; +use std::num::ParseIntError; +use std::str::FromStr; + +use serde::de::Visitor; + +pub use self::bus::{PciBus, PciConfigIo, PciConfigMmio, PciRoot, PciRootError}; +pub use self::configuration::{ + PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciCapability, PciCapabilityId, + PciClassCode, PciConfiguration, PciExpressCapabilityId, PciHeaderType, PciMassStorageSubclass, + PciNetworkControllerSubclass, PciProgrammingInterface, PciSerialBusSubClass, PciSubclass, + PCI_CONFIGURATION_ID, +}; +pub use self::device::{ + BarReprogrammingParams, DeviceRelocation, Error as PciDeviceError, PciDevice, +}; +pub use self::msi::{msi_num_enabled_vectors, MsiCap, MsiConfig}; +pub use self::msix::{MsixCap, MsixConfig, MsixTableEntry, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE}; + +/// PCI has four interrupt pins A->D. 
+#[derive(Copy, Clone)] +pub enum PciInterruptPin { + IntA, + IntB, + IntC, + IntD, +} + +impl PciInterruptPin { + pub fn to_mask(self) -> u32 { + self as u32 + } +} + +#[cfg(target_arch = "x86_64")] +pub const PCI_CONFIG_IO_PORT: u64 = 0xcf8; +#[cfg(target_arch = "x86_64")] +pub const PCI_CONFIG_IO_PORT_SIZE: u64 = 0x8; + +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)] +pub struct PciBdf(u32); + +struct PciBdfVisitor; + +impl Visitor<'_> for PciBdfVisitor { + type Value = PciBdf; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("struct PciBdf") + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + Ok(v.into()) + } +} + +impl<'de> serde::Deserialize<'de> for PciBdf { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + deserializer.deserialize_str(PciBdfVisitor) + } +} + +impl serde::Serialize for PciBdf { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + serializer.collect_str(&self.to_string()) + } +} + +impl PciBdf { + pub fn segment(&self) -> u16 { + ((self.0 >> 16) & 0xffff) as u16 + } + + pub fn bus(&self) -> u8 { + ((self.0 >> 8) & 0xff) as u8 + } + + pub fn device(&self) -> u8 { + ((self.0 >> 3) & 0x1f) as u8 + } + + pub fn function(&self) -> u8 { + (self.0 & 0x7) as u8 + } + + pub fn new(segment: u16, bus: u8, device: u8, function: u8) -> Self { + Self( + ((segment as u32) << 16) + | ((bus as u32) << 8) + | (((device & 0x1f) as u32) << 3) + | (function & 0x7) as u32, + ) + } +} + +impl From for PciBdf { + fn from(bdf: u32) -> Self { + Self(bdf) + } +} + +impl From for u32 { + fn from(bdf: PciBdf) -> Self { + bdf.0 + } +} + +impl From<&PciBdf> for u32 { + fn from(bdf: &PciBdf) -> Self { + bdf.0 + } +} + +impl From for u16 { + fn from(bdf: PciBdf) -> Self { + (bdf.0 & 0xffff) as u16 + } +} + +impl From<&PciBdf> for u16 { + fn from(bdf: &PciBdf) -> Self { + (bdf.0 & 0xffff) as u16 + } +} + +impl Debug for PciBdf { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{:04x}:{:02x}:{:02x}.{:01x}", + self.segment(), + self.bus(), + self.device(), + self.function() + ) + } +} + +impl Display for PciBdf { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{:04x}:{:02x}:{:02x}.{:01x}", + self.segment(), + self.bus(), + self.device(), + self.function() + ) + } +} + +impl FromStr for PciBdf { + type Err = ParseIntError; + + fn from_str(s: &str) -> Result { + let items: Vec<&str> = s.split('.').collect(); + assert_eq!(items.len(), 2); + let function = u8::from_str_radix(items[1], 16)?; + let items: Vec<&str> = items[0].split(':').collect(); + assert_eq!(items.len(), 3); + let segment = u16::from_str_radix(items[0], 16)?; + let bus = u8::from_str_radix(items[1], 16)?; + let device = u8::from_str_radix(items[2], 16)?; + Ok(PciBdf::new(segment, bus, device, function)) + } +} + +impl From<&str> for PciBdf { + fn from(bdf: &str) -> Self { + Self::from_str(bdf).unwrap() + } +} diff --git a/src/pci/src/msi.rs b/src/pci/src/msi.rs new file mode 100644 index 00000000000..16d593cd115 --- /dev/null +++ b/src/pci/src/msi.rs @@ -0,0 +1,282 @@ +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause +// + +use std::io; +use std::sync::Arc; + +use byteorder::{ByteOrder, LittleEndian}; +use serde::{Deserialize, Serialize}; +use thiserror::Error; +use vm_device::interrupt::{ + InterruptIndex, InterruptSourceConfig, InterruptSourceGroup, 
MsiIrqSourceConfig, +}; + +// MSI control masks +const MSI_CTL_ENABLE: u16 = 0x1; +const MSI_CTL_MULTI_MSG_ENABLE: u16 = 0x70; +const MSI_CTL_64_BITS: u16 = 0x80; +const MSI_CTL_PER_VECTOR: u16 = 0x100; + +// MSI message offsets +const MSI_MSG_CTL_OFFSET: u64 = 0x2; +const MSI_MSG_ADDR_LO_OFFSET: u64 = 0x4; + +// MSI message masks +const MSI_MSG_ADDR_LO_MASK: u32 = 0xffff_fffc; + +pub fn msi_num_enabled_vectors(msg_ctl: u16) -> usize { + let field = (msg_ctl >> 4) & 0x7; + + if field > 5 { + return 0; + } + + 1 << field +} + +#[derive(Error, Debug)] +pub enum Error { + #[error("Failed enabling the interrupt route: {0}")] + EnableInterruptRoute(io::Error), + #[error("Failed updating the interrupt route: {0}")] + UpdateInterruptRoute(io::Error), +} + +#[derive(Clone, Copy, Default, Serialize, Deserialize)] +pub struct MsiCap { + // Message Control Register + // 0: MSI enable. + // 3-1; Multiple message capable. + // 6-4: Multiple message enable. + // 7: 64 bits address capable. + // 8: Per-vector masking capable. + // 15-9: Reserved. + pub msg_ctl: u16, + // Message Address (LSB) + // 1-0: Reserved. + // 31-2: Message address. + pub msg_addr_lo: u32, + // Message Upper Address (MSB) + // 31-0: Message address. + pub msg_addr_hi: u32, + // Message Data + // 15-0: Message data. + pub msg_data: u16, + // Mask Bits + // 31-0: Mask bits. + pub mask_bits: u32, + // Pending Bits + // 31-0: Pending bits. + pub pending_bits: u32, +} + +impl MsiCap { + fn addr_64_bits(&self) -> bool { + self.msg_ctl & MSI_CTL_64_BITS == MSI_CTL_64_BITS + } + + fn per_vector_mask(&self) -> bool { + self.msg_ctl & MSI_CTL_PER_VECTOR == MSI_CTL_PER_VECTOR + } + + fn enabled(&self) -> bool { + self.msg_ctl & MSI_CTL_ENABLE == MSI_CTL_ENABLE + } + + fn num_enabled_vectors(&self) -> usize { + msi_num_enabled_vectors(self.msg_ctl) + } + + fn vector_masked(&self, vector: usize) -> bool { + if !self.per_vector_mask() { + return false; + } + + (self.mask_bits >> vector) & 0x1 == 0x1 + } + + fn size(&self) -> u64 { + let mut size: u64 = 0xa; + + if self.addr_64_bits() { + size += 0x4; + } + if self.per_vector_mask() { + size += 0xa; + } + + size + } + + fn update(&mut self, offset: u64, data: &[u8]) { + // Calculate message data offset depending on the address being 32 or + // 64 bits. + // Calculate upper address offset if the address is 64 bits. + // Calculate mask bits offset based on the address being 32 or 64 bits + // and based on the per vector masking being enabled or not. + let (msg_data_offset, addr_hi_offset, mask_bits_offset): (u64, Option, Option) = + if self.addr_64_bits() { + let mask_bits = if self.per_vector_mask() { + Some(0x10) + } else { + None + }; + (0xc, Some(0x8), mask_bits) + } else { + let mask_bits = if self.per_vector_mask() { + Some(0xc) + } else { + None + }; + (0x8, None, mask_bits) + }; + + // Update cache without overriding the read-only bits. 
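+ // For reference: msg_ctl sits at offset 0x2 and msg_addr_lo at 0x4 in both layouts. With a
+ // 64-bit address, msg_addr_hi is at 0x8, msg_data at 0xc and the optional mask bits at 0x10;
+ // with a 32-bit address, msg_data is at 0x8 and the mask bits at 0xc, which is exactly what
+ // the tuple computed above encodes.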
+ match data.len() { + 2 => { + let value = LittleEndian::read_u16(data); + match offset { + MSI_MSG_CTL_OFFSET => { + self.msg_ctl = (self.msg_ctl & !(MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) + | (value & (MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) + } + x if x == msg_data_offset => self.msg_data = value, + _ => error!("invalid offset"), + } + } + 4 => { + let value = LittleEndian::read_u32(data); + match offset { + 0x0 => { + self.msg_ctl = (self.msg_ctl & !(MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) + | ((value >> 16) as u16 & (MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) + } + MSI_MSG_ADDR_LO_OFFSET => self.msg_addr_lo = value & MSI_MSG_ADDR_LO_MASK, + x if x == msg_data_offset => self.msg_data = value as u16, + x if addr_hi_offset.is_some() && x == addr_hi_offset.unwrap() => { + self.msg_addr_hi = value + } + x if mask_bits_offset.is_some() && x == mask_bits_offset.unwrap() => { + self.mask_bits = value + } + _ => error!("invalid offset"), + } + } + _ => error!("invalid data length"), + } + } +} + +#[derive(Serialize, Deserialize)] +pub struct MsiConfigState { + cap: MsiCap, +} + +pub struct MsiConfig { + pub cap: MsiCap, + interrupt_source_group: Arc, +} + +impl MsiConfig { + pub fn new( + msg_ctl: u16, + interrupt_source_group: Arc, + state: Option, + ) -> Result { + let cap = if let Some(state) = state { + if state.cap.enabled() { + for idx in 0..state.cap.num_enabled_vectors() { + let config = MsiIrqSourceConfig { + high_addr: state.cap.msg_addr_hi, + low_addr: state.cap.msg_addr_lo, + data: state.cap.msg_data as u32, + devid: 0, + }; + + interrupt_source_group + .update( + idx as InterruptIndex, + InterruptSourceConfig::MsiIrq(config), + state.cap.vector_masked(idx), + false, + ) + .map_err(Error::UpdateInterruptRoute)?; + } + + interrupt_source_group + .set_gsi() + .map_err(Error::EnableInterruptRoute)?; + + interrupt_source_group + .enable() + .map_err(Error::EnableInterruptRoute)?; + } + + state.cap + } else { + MsiCap { + msg_ctl, + ..Default::default() + } + }; + + Ok(MsiConfig { + cap, + interrupt_source_group, + }) + } + + pub fn state(&self) -> MsiConfigState { + MsiConfigState { cap: self.cap } + } + + pub fn enabled(&self) -> bool { + self.cap.enabled() + } + + pub fn size(&self) -> u64 { + self.cap.size() + } + + pub fn num_enabled_vectors(&self) -> usize { + self.cap.num_enabled_vectors() + } + + pub fn update(&mut self, offset: u64, data: &[u8]) { + let old_enabled = self.cap.enabled(); + + self.cap.update(offset, data); + + if self.cap.enabled() { + for idx in 0..self.num_enabled_vectors() { + let config = MsiIrqSourceConfig { + high_addr: self.cap.msg_addr_hi, + low_addr: self.cap.msg_addr_lo, + data: self.cap.msg_data as u32, + devid: 0, + }; + + if let Err(e) = self.interrupt_source_group.update( + idx as InterruptIndex, + InterruptSourceConfig::MsiIrq(config), + self.cap.vector_masked(idx), + true, + ) { + error!("Failed updating vector: {:?}", e); + } + } + + if !old_enabled { + if let Err(e) = self.interrupt_source_group.enable() { + error!("Failed enabling irq_fd: {:?}", e); + } + } + } else if old_enabled { + if let Err(e) = self.interrupt_source_group.disable() { + error!("Failed disabling irq_fd: {:?}", e); + } + } + } +} diff --git a/src/pci/src/msix.rs b/src/pci/src/msix.rs new file mode 100644 index 00000000000..4b3cf688980 --- /dev/null +++ b/src/pci/src/msix.rs @@ -0,0 +1,552 @@ +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause +// + +use std::sync::Arc; +use std::{io, result}; + +use 
byteorder::{ByteOrder, LittleEndian}; +use serde::{Deserialize, Serialize}; +use vm_device::interrupt::{ + InterruptIndex, InterruptSourceConfig, InterruptSourceGroup, MsiIrqSourceConfig, +}; +use vm_memory::ByteValued; + +use crate::{PciCapability, PciCapabilityId}; + +const MAX_MSIX_VECTORS_PER_DEVICE: u16 = 2048; +const MSIX_TABLE_ENTRIES_MODULO: u64 = 16; +const MSIX_PBA_ENTRIES_MODULO: u64 = 8; +const BITS_PER_PBA_ENTRY: usize = 64; +const FUNCTION_MASK_BIT: u8 = 14; +const MSIX_ENABLE_BIT: u8 = 15; +const FUNCTION_MASK_MASK: u16 = (1 << FUNCTION_MASK_BIT) as u16; +const MSIX_ENABLE_MASK: u16 = (1 << MSIX_ENABLE_BIT) as u16; +pub const MSIX_TABLE_ENTRY_SIZE: usize = 16; +pub const MSIX_CONFIG_ID: &str = "msix_config"; + +#[derive(Debug)] +pub enum Error { + /// Failed enabling the interrupt route. + EnableInterruptRoute(io::Error), + /// Failed updating the interrupt route. + UpdateInterruptRoute(io::Error), +} + +#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] +pub struct MsixTableEntry { + pub msg_addr_lo: u32, + pub msg_addr_hi: u32, + pub msg_data: u32, + pub vector_ctl: u32, +} + +impl MsixTableEntry { + pub fn masked(&self) -> bool { + self.vector_ctl & 0x1 == 0x1 + } +} + +impl Default for MsixTableEntry { + fn default() -> Self { + MsixTableEntry { + msg_addr_lo: 0, + msg_addr_hi: 0, + msg_data: 0, + vector_ctl: 0x1, + } + } +} + +#[derive(Serialize, Deserialize)] +pub struct MsixConfigState { + table_entries: Vec, + pba_entries: Vec, + masked: bool, + enabled: bool, +} + +pub struct MsixConfig { + pub table_entries: Vec, + pub pba_entries: Vec, + pub devid: u32, + interrupt_source_group: Arc, + masked: bool, + enabled: bool, +} + +impl MsixConfig { + pub fn new( + msix_vectors: u16, + interrupt_source_group: Arc, + devid: u32, + state: Option, + ) -> result::Result { + assert!(msix_vectors <= MAX_MSIX_VECTORS_PER_DEVICE); + + let (table_entries, pba_entries, masked, enabled) = if let Some(state) = state { + if state.enabled && !state.masked { + for (idx, table_entry) in state.table_entries.iter().enumerate() { + if table_entry.masked() { + continue; + } + + let config = MsiIrqSourceConfig { + high_addr: table_entry.msg_addr_hi, + low_addr: table_entry.msg_addr_lo, + data: table_entry.msg_data, + devid, + }; + + interrupt_source_group + .update( + idx as InterruptIndex, + InterruptSourceConfig::MsiIrq(config), + state.masked, + true, + ) + .map_err(Error::UpdateInterruptRoute)?; + + interrupt_source_group + .enable() + .map_err(Error::EnableInterruptRoute)?; + } + } + + ( + state.table_entries, + state.pba_entries, + state.masked, + state.enabled, + ) + } else { + let mut table_entries: Vec = Vec::new(); + table_entries.resize_with(msix_vectors as usize, Default::default); + let mut pba_entries: Vec = Vec::new(); + let num_pba_entries: usize = ((msix_vectors as usize) / BITS_PER_PBA_ENTRY) + 1; + pba_entries.resize_with(num_pba_entries, Default::default); + + (table_entries, pba_entries, true, false) + }; + + Ok(MsixConfig { + table_entries, + pba_entries, + devid, + interrupt_source_group, + masked, + enabled, + }) + } + + pub fn state(&self) -> MsixConfigState { + MsixConfigState { + table_entries: self.table_entries.clone(), + pba_entries: self.pba_entries.clone(), + masked: self.masked, + enabled: self.enabled, + } + } + + pub fn masked(&self) -> bool { + self.masked + } + + pub fn enabled(&self) -> bool { + self.enabled + } + + pub fn set_msg_ctl(&mut self, reg: u16) { + let old_masked = self.masked; + let old_enabled = self.enabled; + + self.masked = ((reg 
>> FUNCTION_MASK_BIT) & 1u16) == 1u16; + self.enabled = ((reg >> MSIX_ENABLE_BIT) & 1u16) == 1u16; + + // Update interrupt routing + if old_masked != self.masked || old_enabled != self.enabled { + if self.enabled && !self.masked { + debug!("MSI-X enabled for device 0x{:x}", self.devid); + for (idx, table_entry) in self.table_entries.iter().enumerate() { + let config = MsiIrqSourceConfig { + high_addr: table_entry.msg_addr_hi, + low_addr: table_entry.msg_addr_lo, + data: table_entry.msg_data, + devid: self.devid, + }; + + if let Err(e) = self.interrupt_source_group.update( + idx as InterruptIndex, + InterruptSourceConfig::MsiIrq(config), + table_entry.masked(), + true, + ) { + error!("Failed updating vector: {:?}", e); + } + } + } else if old_enabled || !old_masked { + debug!("MSI-X disabled for device 0x{:x}", self.devid); + if let Err(e) = self.interrupt_source_group.disable() { + error!("Failed disabling irq_fd: {:?}", e); + } + } + } + + // If the Function Mask bit was set, and has just been cleared, it's + // important to go through the entire PBA to check if there was any + // pending MSI-X message to inject, given that the vector is not + // masked. + if old_masked && !self.masked { + for (index, entry) in self.table_entries.clone().iter().enumerate() { + if !entry.masked() && self.get_pba_bit(index as u16) == 1 { + self.inject_msix_and_clear_pba(index); + } + } + } + } + + pub fn read_table(&self, offset: u64, data: &mut [u8]) { + assert!((data.len() == 4 || data.len() == 8)); + + let index: usize = (offset / MSIX_TABLE_ENTRIES_MODULO) as usize; + let modulo_offset = offset % MSIX_TABLE_ENTRIES_MODULO; + + if index >= self.table_entries.len() { + debug!("Invalid MSI-X table entry index {index}"); + data.copy_from_slice(&[0xff; 8][..data.len()]); + return; + } + + match data.len() { + 4 => { + let value = match modulo_offset { + 0x0 => self.table_entries[index].msg_addr_lo, + 0x4 => self.table_entries[index].msg_addr_hi, + 0x8 => self.table_entries[index].msg_data, + 0xc => self.table_entries[index].vector_ctl, + _ => { + error!("invalid offset"); + 0 + } + }; + + debug!("MSI_R TABLE offset 0x{:x} data 0x{:x}", offset, value); + LittleEndian::write_u32(data, value); + } + 8 => { + let value = match modulo_offset { + 0x0 => { + (u64::from(self.table_entries[index].msg_addr_hi) << 32) + | u64::from(self.table_entries[index].msg_addr_lo) + } + 0x8 => { + (u64::from(self.table_entries[index].vector_ctl) << 32) + | u64::from(self.table_entries[index].msg_data) + } + _ => { + error!("invalid offset"); + 0 + } + }; + + debug!("MSI_R TABLE offset 0x{:x} data 0x{:x}", offset, value); + LittleEndian::write_u64(data, value); + } + _ => { + error!("invalid data length"); + } + } + } + + pub fn write_table(&mut self, offset: u64, data: &[u8]) { + assert!((data.len() == 4 || data.len() == 8)); + + let index: usize = (offset / MSIX_TABLE_ENTRIES_MODULO) as usize; + let modulo_offset = offset % MSIX_TABLE_ENTRIES_MODULO; + + if index >= self.table_entries.len() { + debug!("Invalid MSI-X table entry index {index}"); + return; + } + + // Store the value of the entry before modification + let old_entry = self.table_entries[index].clone(); + + match data.len() { + 4 => { + let value = LittleEndian::read_u32(data); + match modulo_offset { + 0x0 => self.table_entries[index].msg_addr_lo = value, + 0x4 => self.table_entries[index].msg_addr_hi = value, + 0x8 => self.table_entries[index].msg_data = value, + 0xc => { + self.table_entries[index].vector_ctl = value; + } + _ => error!("invalid offset"), + }; + + 
debug!("MSI_W TABLE offset 0x{:x} data 0x{:x}", offset, value); + } + 8 => { + let value = LittleEndian::read_u64(data); + match modulo_offset { + 0x0 => { + self.table_entries[index].msg_addr_lo = (value & 0xffff_ffffu64) as u32; + self.table_entries[index].msg_addr_hi = (value >> 32) as u32; + } + 0x8 => { + self.table_entries[index].msg_data = (value & 0xffff_ffffu64) as u32; + self.table_entries[index].vector_ctl = (value >> 32) as u32; + } + _ => error!("invalid offset"), + }; + + debug!("MSI_W TABLE offset 0x{:x} data 0x{:x}", offset, value); + } + _ => error!("invalid data length"), + }; + + let table_entry = &self.table_entries[index]; + + // Optimisation to avoid excessive updates + if &old_entry == table_entry { + return; + } + + // Update interrupt routes + // Optimisation: only update routes if the entry is not masked; + // this is safe because if the entry is masked (starts masked as per spec) + // in the table then it won't be triggered. (See: #4273) + if self.enabled && !self.masked && !table_entry.masked() { + let config = MsiIrqSourceConfig { + high_addr: table_entry.msg_addr_hi, + low_addr: table_entry.msg_addr_lo, + data: table_entry.msg_data, + devid: self.devid, + }; + + if let Err(e) = self.interrupt_source_group.update( + index as InterruptIndex, + InterruptSourceConfig::MsiIrq(config), + table_entry.masked(), + true, + ) { + error!("Failed updating vector: {:?}", e); + } + } + + // After the MSI-X table entry has been updated, it is necessary to + // check if the vector control masking bit has changed. In case the + // bit has been flipped from 1 to 0, we need to inject a MSI message + // if the corresponding pending bit from the PBA is set. Once the MSI + // has been injected, the pending bit in the PBA needs to be cleared. + // All of this is valid only if MSI-X has not been masked for the whole + // device. 
+ + // Check if bit has been flipped + if !self.masked() + && self.enabled() + && old_entry.masked() + && !table_entry.masked() + && self.get_pba_bit(index as u16) == 1 + { + self.inject_msix_and_clear_pba(index); + } + } + + pub fn read_pba(&mut self, offset: u64, data: &mut [u8]) { + assert!((data.len() == 4 || data.len() == 8)); + + let index: usize = (offset / MSIX_PBA_ENTRIES_MODULO) as usize; + let modulo_offset = offset % MSIX_PBA_ENTRIES_MODULO; + + if index >= self.pba_entries.len() { + debug!("Invalid MSI-X PBA entry index {index}"); + data.copy_from_slice(&[0xff; 8][..data.len()]); + return; + } + + match data.len() { + 4 => { + let value: u32 = match modulo_offset { + 0x0 => (self.pba_entries[index] & 0xffff_ffffu64) as u32, + 0x4 => (self.pba_entries[index] >> 32) as u32, + _ => { + error!("invalid offset"); + 0 + } + }; + + debug!("MSI_R PBA offset 0x{:x} data 0x{:x}", offset, value); + LittleEndian::write_u32(data, value); + } + 8 => { + let value: u64 = match modulo_offset { + 0x0 => self.pba_entries[index], + _ => { + error!("invalid offset"); + 0 + } + }; + + debug!("MSI_R PBA offset 0x{:x} data 0x{:x}", offset, value); + LittleEndian::write_u64(data, value); + } + _ => { + error!("invalid data length"); + } + } + } + + pub fn write_pba(&mut self, _offset: u64, _data: &[u8]) { + error!("Pending Bit Array is read only"); + } + + pub fn set_pba_bit(&mut self, vector: u16, reset: bool) { + assert!(vector < MAX_MSIX_VECTORS_PER_DEVICE); + + let index: usize = (vector as usize) / BITS_PER_PBA_ENTRY; + let shift: usize = (vector as usize) % BITS_PER_PBA_ENTRY; + let mut mask: u64 = (1 << shift) as u64; + + if reset { + mask = !mask; + self.pba_entries[index] &= mask; + } else { + self.pba_entries[index] |= mask; + } + } + + fn get_pba_bit(&self, vector: u16) -> u8 { + assert!(vector < MAX_MSIX_VECTORS_PER_DEVICE); + + let index: usize = (vector as usize) / BITS_PER_PBA_ENTRY; + let shift: usize = (vector as usize) % BITS_PER_PBA_ENTRY; + + ((self.pba_entries[index] >> shift) & 0x0000_0001u64) as u8 + } + + fn inject_msix_and_clear_pba(&mut self, vector: usize) { + // Inject the MSI message + match self + .interrupt_source_group + .trigger(vector as InterruptIndex) + { + Ok(_) => debug!("MSI-X injected on vector control flip"), + Err(e) => error!("failed to inject MSI-X: {}", e), + } + + // Clear the bit from PBA + self.set_pba_bit(vector as u16, true); + } +} + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Clone, Copy, Default, Serialize, Deserialize)] +pub struct MsixCap { + // Message Control Register + // 10-0: MSI-X Table size + // 13-11: Reserved + // 14: Mask. Mask all MSI-X when set. + // 15: Enable. Enable all MSI-X when set. + pub msg_ctl: u16, + // Table. Contains the offset and the BAR indicator (BIR) + // 2-0: Table BAR indicator (BIR). Can be 0 to 5. + // 31-3: Table offset in the BAR pointed by the BIR. + pub table: u32, + // Pending Bit Array. Contains the offset and the BAR indicator (BIR) + // 2-0: PBA BAR indicator (BIR). Can be 0 to 5. + // 31-3: PBA offset in the BAR pointed by the BIR. + pub pba: u32, +} + +// SAFETY: All members are simple numbers and any value is valid. 
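+// In addition, the struct is repr(C, packed) and only contains u16/u32 fields, so there are no
+// padding bytes, which is what the ByteValued impl below relies on.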
+unsafe impl ByteValued for MsixCap {} + +impl PciCapability for MsixCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::MsiX + } +} + +impl MsixCap { + pub fn new( + table_pci_bar: u8, + table_size: u16, + table_off: u32, + pba_pci_bar: u8, + pba_off: u32, + ) -> Self { + assert!(table_size < MAX_MSIX_VECTORS_PER_DEVICE); + + // Set the table size and enable MSI-X. + let msg_ctl: u16 = 0x8000u16 + table_size - 1; + + MsixCap { + msg_ctl, + table: (table_off & 0xffff_fff8u32) | u32::from(table_pci_bar & 0x7u8), + pba: (pba_off & 0xffff_fff8u32) | u32::from(pba_pci_bar & 0x7u8), + } + } + + pub fn set_msg_ctl(&mut self, data: u16) { + self.msg_ctl = (self.msg_ctl & !(FUNCTION_MASK_MASK | MSIX_ENABLE_MASK)) + | (data & (FUNCTION_MASK_MASK | MSIX_ENABLE_MASK)); + } + + pub fn masked(&self) -> bool { + (self.msg_ctl >> FUNCTION_MASK_BIT) & 0x1 == 0x1 + } + + pub fn enabled(&self) -> bool { + (self.msg_ctl >> MSIX_ENABLE_BIT) & 0x1 == 0x1 + } + + pub fn table_offset(&self) -> u32 { + self.table & 0xffff_fff8 + } + + pub fn pba_offset(&self) -> u32 { + self.pba & 0xffff_fff8 + } + + pub fn table_set_offset(&mut self, addr: u32) { + self.table &= 0x7; + self.table += addr; + } + + pub fn pba_set_offset(&mut self, addr: u32) { + self.pba &= 0x7; + self.pba += addr; + } + + pub fn table_bir(&self) -> u32 { + self.table & 0x7 + } + + pub fn pba_bir(&self) -> u32 { + self.pba & 0x7 + } + + pub fn table_size(&self) -> u16 { + (self.msg_ctl & 0x7ff) + 1 + } + + pub fn table_range(&self) -> (u64, u64) { + // The table takes 16 bytes per entry. + let size = self.table_size() as u64 * 16; + (self.table_offset() as u64, size) + } + + pub fn pba_range(&self) -> (u64, u64) { + // The table takes 1 bit per entry modulo 8 bytes. + let size = ((self.table_size() as u64 / 64) + 1) * 8; + (self.pba_offset() as u64, size) + } +} diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index e6c600378c1..6bfd64853b5 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -14,6 +14,7 @@ tracing = ["log-instrument"] gdb = ["arrayvec", "gdbstub", "gdbstub_arch"] [dependencies] + acpi_tables = { path = "../acpi-tables" } aes-gcm = { version = "0.10.1", default-features = false, features = ["aes"] } arrayvec = { version = "0.7.6", optional = true } @@ -38,6 +39,7 @@ log = { version = "0.4.27", features = ["std", "serde"] } log-instrument = { path = "../log-instrument", optional = true } memfd = "0.6.3" micro_http = { git = "https://github.com/firecracker-microvm/micro-http" } +pci = { path = "../pci" } semver = { version = "1.0.26", features = ["serde"] } serde = { version = "1.0.219", features = ["derive", "rc"] } serde_json = "1.0.142" @@ -46,6 +48,7 @@ thiserror = "2.0.12" timerfd = "1.5.0" userfaultfd = "0.8.1" utils = { path = "../utils" } +uuid = "1.16.0" vhost = { version = "0.14.0", features = ["vhost-user-frontend"] } vm-allocator = "0.1.3" vm-device = { path = "../vm-device" } From 00a6982fc569d05db4e316dfaefdf5344994b596 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 5 May 2025 17:52:08 +0200 Subject: [PATCH 15/99] arch: define 64-bit capable MMIO memory regions PCIe distinguishes MMIO regions between 32bit and 64bit, caring for devices that can't deal with 64-bit addresses. This commit defines the appropriate regions for both x86 and aarch64 architectures, extends the resource allocator to handle allocations for both of these regions and adjusts the logic that calculates the memory regions for the architecture. 
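For instance (illustrative numbers only, derived from the constants introduced here): an aarch64 guest with 600 GiB of RAM now gets its memory split into [2 GiB, 256 GiB) and [512 GiB, 858 GiB), with the 64-bit MMIO window occupying [256 GiB, 512 GiB); on x86_64 the same guest would additionally be split around the 32-bit hole at [3 GiB, 4 GiB).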
Also, un-do the change that added an `offset` argument `arch_memory_regions` function. We won't be using this for "secret hiding" so it just made the logic (especially for kani proofs) too convoluted. Signed-off-by: Babis Chalios --- src/vmm/src/arch/aarch64/gic/gicv2/mod.rs | 2 +- src/vmm/src/arch/aarch64/gic/gicv3/mod.rs | 2 +- src/vmm/src/arch/aarch64/layout.rs | 69 ++++- src/vmm/src/arch/aarch64/mod.rs | 144 ++++++---- src/vmm/src/arch/mod.rs | 52 +++- src/vmm/src/arch/x86_64/layout.rs | 45 ++++ src/vmm/src/arch/x86_64/mod.rs | 251 ++++++++++-------- src/vmm/src/builder.rs | 10 +- src/vmm/src/device_manager/mmio.rs | 41 +-- src/vmm/src/device_manager/mod.rs | 7 +- src/vmm/src/device_manager/persist.rs | 23 +- src/vmm/src/device_manager/resources.rs | 45 +++- .../src/devices/virtio/vsock/event_handler.rs | 16 +- src/vmm/src/resources.rs | 2 +- src/vmm/src/test_utils/mod.rs | 4 +- 15 files changed, 457 insertions(+), 256 deletions(-) diff --git a/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs index 22aaa4b4b74..c4b9208a0a6 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs @@ -30,7 +30,7 @@ impl GICv2 { /// Get the address of the GICv2 distributor. const fn get_dist_addr() -> u64 { - super::layout::MAPPED_IO_START - GICv2::KVM_VGIC_V2_DIST_SIZE + super::layout::MMIO32_MEM_START - GICv2::KVM_VGIC_V2_DIST_SIZE } /// Get the size of the GIC_v2 distributor. diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs index 558b47ab065..39c4e5ce148 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs @@ -30,7 +30,7 @@ impl GICv3 { /// Get the address of the GIC distributor. fn get_dist_addr() -> u64 { - super::layout::MAPPED_IO_START - GICv3::KVM_VGIC_V3_DIST_SIZE + super::layout::MMIO32_MEM_START - GICv3::KVM_VGIC_V3_DIST_SIZE } /// Get the size of the GIC distributor. 
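The two GIC hunks above only chase the rename; the distributor is still anchored immediately below the first 32-bit MMIO address. A minimal sketch of that placement (the distributor sizes are assumptions for illustration; the real values come from the GICv2/GICv3 modules):

// Illustrative only: where the distributors land relative to the renamed constant.
const MMIO32_MEM_START: u64 = 1 << 30; // 1 GiB, as defined in the aarch64 layout below
const ASSUMED_GICV2_DIST_SIZE: u64 = 0x1000; // assumed 4 KiB, for illustration
const ASSUMED_GICV3_DIST_SIZE: u64 = 0x10000; // assumed 64 KiB, for illustration

fn main() {
    let v2_dist = MMIO32_MEM_START - ASSUMED_GICV2_DIST_SIZE;
    let v3_dist = MMIO32_MEM_START - ASSUMED_GICV3_DIST_SIZE;
    // Both distributors end exactly where the 32-bit MMIO window begins, so the GIC never
    // collides with the device ranges carved out above 1 GiB.
    println!("GICv2 dist: {v2_dist:#x}, GICv3 dist: {v3_dist:#x}");
}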
diff --git a/src/vmm/src/arch/aarch64/layout.rs b/src/vmm/src/arch/aarch64/layout.rs index 8f95519830e..4b1f6ecda5b 100644 --- a/src/vmm/src/arch/aarch64/layout.rs +++ b/src/vmm/src/arch/aarch64/layout.rs @@ -4,51 +4,53 @@ // ==== Address map in use in ARM development systems today ==== // // - 32-bit - - 36-bit - - 40-bit - -// 1024GB + + +-------------------+ <- 40-bit +// 1024GB + + +-------------------+ <- 40-bit // | | DRAM | // ~ ~ ~ ~ // | | | // | | | // | | | // | | | -// 544GB + + +-------------------+ +// 544GB + + +-------------------+ // | | Hole or DRAM | // | | | -// 512GB + + +-------------------+ +// 512GB + + +-------------------+ // | | Mapped | // | | I/O | // ~ ~ ~ ~ // | | | -// 256GB + + +-------------------+ +// 256GB + + +-------------------+ // | | Reserved | // ~ ~ ~ ~ // | | | -// 64GB + +-----------------------+-------------------+ <- 36-bit +// 64GB + +-----------------------+-------------------+ <- 36-bit // | | DRAM | // ~ ~ ~ ~ // | | | // | | | -// 34GB + +-----------------------+-------------------+ +// 34GB + +-----------------------+-------------------+ // | | Hole or DRAM | -// 32GB + +-----------------------+-------------------+ +// 32GB + +-----------------------+-------------------+ // | | Mapped I/O | // ~ ~ ~ ~ // | | | -// 16GB + +-----------------------+-------------------+ +// 16GB + +-----------------------+-------------------+ // | | Reserved | // ~ ~ ~ ~ -// 4GB +-------------------+-----------------------+-------------------+ <- 32-bit +// 4GB +-------------------+-----------------------+-------------------+ <- 32-bit // | 2GB of DRAM | // | | -// 2GB +-------------------+-----------------------+-------------------+ +// 2GB +-------------------+-----------------------+-------------------+ // | Mapped I/O | -// 1GB +-------------------+-----------------------+-------------------+ +// 1GB +-------------------+-----------------------+-------------------+ // | ROM & RAM & I/O | -// 0GB +-------------------+-----------------------+-------------------+ 0 +// 0GB +-------------------+-----------------------+-------------------+ 0 // - 32-bit - - 36-bit - - 40-bit - // // Taken from (http://infocenter.arm.com/help/topic/com.arm.doc.den0001c/DEN0001C_principles_of_arm_memory_maps.pdf). +use crate::device_manager::mmio::MMIO_LEN; + /// Start of RAM on 64 bit ARM. pub const DRAM_MEM_START: u64 = 0x8000_0000; // 2 GB. /// The maximum RAM size. @@ -88,5 +90,46 @@ pub const GSI_BASE: u32 = 0; /// The maximum usable GSI on aarch64. pub const GSI_MAX: u32 = IRQ_MAX - IRQ_BASE - 1; +/// The start of the memory area reserved for MMIO 32-bit accesses. /// Below this address will reside the GIC, above this address will reside the MMIO devices. -pub const MAPPED_IO_START: u64 = 1 << 30; // 1 GB +pub const MMIO32_MEM_START: u64 = 1 << 30; // 1GiB +/// The size of the memory area reserved for MMIO 32-bit accesses (1GiB). +pub const MMIO32_MEM_SIZE: u64 = DRAM_MEM_START - MMIO32_MEM_START; + +// The rest of the MMIO address space (256 MiB) we dedicate to PCIe for memory-mapped access to +// configuration. +/// Size of MMIO region for PCIe configuration accesses. +pub const PCI_MMCONFIG_SIZE: u64 = 256 << 20; +/// Start of MMIO region for PCIe configuration accesses. +pub const PCI_MMCONFIG_START: u64 = DRAM_MEM_START - PCI_MMCONFIG_SIZE; +/// MMIO space per PCIe segment +pub const PCI_MMIO_CONFIG_SIZE_PER_SEGMENT: u64 = 4096 * 256; + +// We reserve 768 MiB for devices at the beginning of the MMIO region. This includes space both for +// pure MMIO and PCIe devices. 
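+// Concretely: the boot timer, RTC and serial devices each take one MMIO_LEN-sized slot right
+// above the 1 GiB mark, and everything from there up to PCI_MMCONFIG_START is handed to the
+// 32-bit device allocator, which serves both MMIO-transport devices and 32-bit PCI BARs.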
+ +/// Memory region start for boot device. +pub const BOOT_DEVICE_MEM_START: u64 = MMIO32_MEM_START; +/// Memory region start for RTC device. +pub const RTC_MEM_START: u64 = BOOT_DEVICE_MEM_START + MMIO_LEN; +/// Memory region start for Serial device. +pub const SERIAL_MEM_START: u64 = RTC_MEM_START + MMIO_LEN; + +/// Beginning of memory region for device MMIO 32-bit accesses +pub const MEM_32BIT_DEVICES_START: u64 = SERIAL_MEM_START + MMIO_LEN; +/// Size of memory region for device MMIO 32-bit accesses +pub const MEM_32BIT_DEVICES_SIZE: u64 = PCI_MMCONFIG_START - MEM_32BIT_DEVICES_START; + +// 64-bits region for MMIO accesses +/// The start of the memory area reserved for MMIO 64-bit accesses. +pub const MMIO64_MEM_START: u64 = 256 << 30; +/// The size of the memory area reserved for MMIO 64-bit accesses. +pub const MMIO64_MEM_SIZE: u64 = 256 << 30; + +// At the moment, all of this region goes to devices +/// Beginning of memory region for device MMIO 64-bit accesses +pub const MEM_64BIT_DEVICES_START: u64 = MMIO64_MEM_START; +/// Size of memory region for device MMIO 32-bit accesses +pub const MEM_64BIT_DEVICES_SIZE: u64 = MMIO64_MEM_SIZE; +/// First address past the 64-bit MMIO gap +pub const FIRST_ADDR_PAST_64BITS_MMIO: u64 = MMIO64_MEM_START + MMIO64_MEM_SIZE; diff --git a/src/vmm/src/arch/aarch64/mod.rs b/src/vmm/src/arch/aarch64/mod.rs index 6d1d0e26359..df6e712dcf5 100644 --- a/src/vmm/src/arch/aarch64/mod.rs +++ b/src/vmm/src/arch/aarch64/mod.rs @@ -24,11 +24,11 @@ use linux_loader::loader::pe::PE as Loader; use linux_loader::loader::{Cmdline, KernelLoader}; use vm_memory::GuestMemoryError; -use crate::arch::{BootProtocol, EntryPoint}; +use crate::arch::{BootProtocol, EntryPoint, arch_memory_regions_with_gap}; use crate::cpu_config::aarch64::{CpuConfiguration, CpuConfigurationError}; use crate::cpu_config::templates::CustomCpuTemplate; use crate::initrd::InitrdConfig; -use crate::utils::{align_up, usize_to_u64}; +use crate::utils::{align_up, u64_to_usize, usize_to_u64}; use crate::vmm_config::machine_config::MachineConfig; use crate::vstate::memory::{Address, Bytes, GuestAddress, GuestMemory, GuestMemoryMmap}; use crate::vstate::vcpu::KvmVcpuError; @@ -51,42 +51,34 @@ pub enum ConfigurationError { VcpuConfigure(#[from] KvmVcpuError), } -/// The start of the memory area reserved for MMIO devices. -pub const MMIO_MEM_START: u64 = layout::MAPPED_IO_START; -/// The size of the memory area reserved for MMIO devices. -pub const MMIO_MEM_SIZE: u64 = layout::DRAM_MEM_START - layout::MAPPED_IO_START; //>> 1GB - /// Returns a Vec of the valid memory addresses for aarch64. /// See [`layout`](layout) module for a drawing of the specific memory model for this platform. -/// -/// The `offset` parameter specified the offset from [`layout::DRAM_MEM_START`]. -pub fn arch_memory_regions(offset: usize, size: usize) -> Vec<(GuestAddress, usize)> { +pub fn arch_memory_regions(size: usize) -> Vec<(GuestAddress, usize)> { assert!(size > 0, "Attempt to allocate guest memory of length 0"); - assert!( - offset.checked_add(size).is_some(), - "Attempt to allocate guest memory such that the address space would wrap around" - ); - assert!( - offset < layout::DRAM_MEM_MAX_SIZE, - "offset outside allowed DRAM range" - ); - let dram_size = min(size, layout::DRAM_MEM_MAX_SIZE - offset); + let dram_size = min(size, layout::DRAM_MEM_MAX_SIZE); if dram_size != size { logger::warn!( - "Requested offset/memory size {}/{} exceeds architectural maximum (1022GiB). 
Size has \ - been truncated to {}", - offset, + "Requested memory size {} exceeds architectural maximum (1022GiB). Size has been \ + truncated to {}", size, dram_size ); } - vec![( - GuestAddress(layout::DRAM_MEM_START + offset as u64), + let mut regions = vec![]; + if let Some((offset, remaining)) = arch_memory_regions_with_gap( + &mut regions, + u64_to_usize(layout::DRAM_MEM_START), dram_size, - )] + u64_to_usize(layout::MMIO64_MEM_START), + u64_to_usize(layout::MMIO64_MEM_SIZE), + ) { + regions.push((GuestAddress(offset as u64), remaining)); + } + + regions } /// Configures the system for booting Linux. @@ -211,39 +203,66 @@ pub fn load_kernel( #[cfg(kani)] mod verification { - use vm_memory::GuestAddress; - - use crate::arch::aarch64::layout; + use crate::arch::aarch64::layout::{ + DRAM_MEM_MAX_SIZE, DRAM_MEM_START, FIRST_ADDR_PAST_64BITS_MMIO, MMIO64_MEM_START, + }; use crate::arch::arch_memory_regions; #[kani::proof] #[kani::unwind(3)] fn verify_arch_memory_regions() { - let offset: u64 = kani::any::(); - let len: u64 = kani::any::(); - + let len: usize = kani::any::(); kani::assume(len > 0); - kani::assume(offset.checked_add(len).is_some()); - kani::assume(offset < layout::DRAM_MEM_MAX_SIZE as u64); - let regions = arch_memory_regions(offset as usize, len as usize); + let regions = arch_memory_regions(len); - // No MMIO gap on ARM - assert_eq!(regions.len(), 1); + for region in ®ions { + println!( + "region: [{:x}:{:x})", + region.0.0, + region.0.0 + region.1 as u64 + ); + } - let (GuestAddress(start), actual_len) = regions[0]; - let actual_len = actual_len as u64; + // On Arm we have one MMIO gap that might fall within addressable ranges, + // so we can get either 1 or 2 regions. + assert!(regions.len() >= 1); + assert!(regions.len() <= 2); - assert_eq!(start, layout::DRAM_MEM_START + offset); - assert!(actual_len <= layout::DRAM_MEM_MAX_SIZE as u64); + // The total length of all regions cannot exceed DRAM_MEM_MAX_SIZE + let actual_len = regions.iter().map(|&(_, len)| len).sum::(); + assert!(actual_len <= DRAM_MEM_MAX_SIZE); + // The total length is smaller or equal to the length we asked assert!(actual_len <= len); + // If it's smaller, it's because we asked more than the the maximum possible. + if (actual_len) < len { + assert!(len > DRAM_MEM_MAX_SIZE); + } - if actual_len < len { - assert_eq!( - start + actual_len, - layout::DRAM_MEM_START + layout::DRAM_MEM_MAX_SIZE as u64 - ); - assert!(offset + len >= layout::DRAM_MEM_MAX_SIZE as u64); + // No region overlaps the 64-bit MMIO gap + assert!( + regions + .iter() + .all(|&(start, len)| start.0 >= FIRST_ADDR_PAST_64BITS_MMIO + || start.0 + len as u64 <= MMIO64_MEM_START) + ); + + // All regions start after our DRAM_MEM_START + assert!(regions.iter().all(|&(start, _)| start.0 >= DRAM_MEM_START)); + + // All regions have non-zero length + assert!(regions.iter().all(|&(_, len)| len > 0)); + + // If there's two regions, they perfectly snuggle up the 64bit MMIO gap + if regions.len() == 2 { + kani::cover!(); + + // The very first address should be DRAM_MEM_START + assert_eq!(regions[0].0.0, DRAM_MEM_START); + // The first region ends at the beginning of the 64 bits gap. + assert_eq!(regions[0].0.0 + regions[0].1 as u64, MMIO64_MEM_START); + // The second region starts exactly after the 64 bits gap. 
+ assert_eq!(regions[1].0.0, FIRST_ADDR_PAST_64BITS_MMIO); } } } @@ -251,33 +270,42 @@ mod verification { #[cfg(test)] mod tests { use super::*; + use crate::arch::aarch64::layout::{ + DRAM_MEM_MAX_SIZE, DRAM_MEM_START, FDT_MAX_SIZE, FIRST_ADDR_PAST_64BITS_MMIO, + MMIO64_MEM_START, + }; use crate::test_utils::arch_mem; #[test] fn test_regions_lt_1024gb() { - let regions = arch_memory_regions(0, 1usize << 29); + let regions = arch_memory_regions(1usize << 29); assert_eq!(1, regions.len()); - assert_eq!(GuestAddress(super::layout::DRAM_MEM_START), regions[0].0); + assert_eq!(GuestAddress(DRAM_MEM_START), regions[0].0); assert_eq!(1usize << 29, regions[0].1); } #[test] fn test_regions_gt_1024gb() { - let regions = arch_memory_regions(0, 1usize << 41); - assert_eq!(1, regions.len()); - assert_eq!(GuestAddress(super::layout::DRAM_MEM_START), regions[0].0); - assert_eq!(super::layout::DRAM_MEM_MAX_SIZE, regions[0].1); + let regions = arch_memory_regions(1usize << 41); + assert_eq!(2, regions.len()); + assert_eq!(GuestAddress(DRAM_MEM_START), regions[0].0); + assert_eq!(MMIO64_MEM_START - DRAM_MEM_START, regions[0].1 as u64); + assert_eq!(GuestAddress(FIRST_ADDR_PAST_64BITS_MMIO), regions[1].0); + assert_eq!( + DRAM_MEM_MAX_SIZE as u64 - MMIO64_MEM_START + DRAM_MEM_START, + regions[1].1 as u64 + ); } #[test] fn test_get_fdt_addr() { - let mem = arch_mem(layout::FDT_MAX_SIZE - 0x1000); - assert_eq!(get_fdt_addr(&mem), layout::DRAM_MEM_START); + let mem = arch_mem(FDT_MAX_SIZE - 0x1000); + assert_eq!(get_fdt_addr(&mem), DRAM_MEM_START); - let mem = arch_mem(layout::FDT_MAX_SIZE); - assert_eq!(get_fdt_addr(&mem), layout::DRAM_MEM_START); + let mem = arch_mem(FDT_MAX_SIZE); + assert_eq!(get_fdt_addr(&mem), DRAM_MEM_START); - let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); - assert_eq!(get_fdt_addr(&mem), 0x1000 + layout::DRAM_MEM_START); + let mem = arch_mem(FDT_MAX_SIZE + 0x1000); + assert_eq!(get_fdt_addr(&mem), 0x1000 + DRAM_MEM_START); } } diff --git a/src/vmm/src/arch/mod.rs b/src/vmm/src/arch/mod.rs index ebd270a2e61..0d72241b13c 100644 --- a/src/vmm/src/arch/mod.rs +++ b/src/vmm/src/arch/mod.rs @@ -20,10 +20,14 @@ pub use aarch64::vcpu::*; pub use aarch64::vm::{ArchVm, ArchVmError, VmState}; #[cfg(target_arch = "aarch64")] pub use aarch64::{ - ConfigurationError, MMIO_MEM_SIZE, MMIO_MEM_START, arch_memory_regions, - configure_system_for_boot, get_kernel_start, initrd_load_addr, layout::CMDLINE_MAX_SIZE, - layout::GSI_BASE, layout::GSI_MAX, layout::IRQ_BASE, layout::IRQ_MAX, layout::SYSTEM_MEM_SIZE, - layout::SYSTEM_MEM_START, load_kernel, + ConfigurationError, arch_memory_regions, configure_system_for_boot, get_kernel_start, + initrd_load_addr, layout::BOOT_DEVICE_MEM_START, layout::CMDLINE_MAX_SIZE, layout::GSI_BASE, + layout::GSI_MAX, layout::IRQ_BASE, layout::IRQ_MAX, layout::MEM_32BIT_DEVICES_SIZE, + layout::MEM_32BIT_DEVICES_START, layout::MEM_64BIT_DEVICES_SIZE, + layout::MEM_64BIT_DEVICES_START, layout::MMIO32_MEM_SIZE, layout::MMIO32_MEM_START, + layout::PCI_MMCONFIG_SIZE, layout::PCI_MMCONFIG_START, + layout::PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, layout::RTC_MEM_START, layout::SERIAL_MEM_START, + layout::SYSTEM_MEM_SIZE, layout::SYSTEM_MEM_START, load_kernel, }; /// Module for x86_64 related functionality. 
@@ -39,11 +43,12 @@ pub use x86_64::vm::{ArchVm, ArchVmError, VmState}; #[cfg(target_arch = "x86_64")] pub use crate::arch::x86_64::{ - ConfigurationError, MMIO_MEM_SIZE, MMIO_MEM_START, arch_memory_regions, - configure_system_for_boot, get_kernel_start, initrd_load_addr, layout::APIC_ADDR, - layout::CMDLINE_MAX_SIZE, layout::GSI_BASE, layout::GSI_MAX, layout::IOAPIC_ADDR, - layout::IRQ_BASE, layout::IRQ_MAX, layout::SYSTEM_MEM_SIZE, layout::SYSTEM_MEM_START, - load_kernel, + ConfigurationError, arch_memory_regions, configure_system_for_boot, get_kernel_start, + initrd_load_addr, layout::APIC_ADDR, layout::BOOT_DEVICE_MEM_START, layout::CMDLINE_MAX_SIZE, + layout::GSI_BASE, layout::GSI_MAX, layout::IOAPIC_ADDR, layout::IRQ_BASE, layout::IRQ_MAX, + layout::MEM_32BIT_DEVICES_SIZE, layout::MEM_32BIT_DEVICES_START, + layout::MEM_64BIT_DEVICES_SIZE, layout::MEM_64BIT_DEVICES_START, layout::MMIO32_MEM_SIZE, + layout::MMIO32_MEM_START, layout::SYSTEM_MEM_SIZE, layout::SYSTEM_MEM_START, load_kernel, }; /// Types of devices that can get attached to this platform. @@ -115,3 +120,32 @@ pub struct EntryPoint { /// Specifies which boot protocol to use pub protocol: BootProtocol, } + +/// Adds in [`regions`] the valid memory regions suitable for RAM taking into account a gap in the +/// available address space and returns the remaining region (if any) past this gap +fn arch_memory_regions_with_gap( + regions: &mut Vec<(GuestAddress, usize)>, + region_start: usize, + region_size: usize, + gap_start: usize, + gap_size: usize, +) -> Option<(usize, usize)> { + // 0-sized gaps don't really make sense. We should never receive such a gap. + assert!(gap_size > 0); + + let first_addr_past_gap = gap_start + gap_size; + match (region_start + region_size).checked_sub(gap_start) { + // case0: region fits all before gap + None | Some(0) => { + regions.push((GuestAddress(region_start as u64), region_size)); + None + } + // case1: region starts before the gap and goes past it + Some(remaining) if region_start < gap_start => { + regions.push((GuestAddress(region_start as u64), gap_start - region_start)); + Some((first_addr_past_gap, remaining)) + } + // case2: region starts past the gap + Some(_) => Some((first_addr_past_gap.max(region_start), region_size)), + } +} diff --git a/src/vmm/src/arch/x86_64/layout.rs b/src/vmm/src/arch/x86_64/layout.rs index a4c2f036906..b7d5eb6dc5f 100644 --- a/src/vmm/src/arch/x86_64/layout.rs +++ b/src/vmm/src/arch/x86_64/layout.rs @@ -7,6 +7,9 @@ //! Magic addresses externally used to lay out x86_64 VMs. +use crate::device_manager::mmio::MMIO_LEN; +use crate::utils::mib_to_bytes; + /// Initial stack for the boot CPU. pub const BOOT_STACK_POINTER: u64 = 0x8ff0; @@ -83,3 +86,45 @@ pub const SYSTEM_MEM_START: u64 = 0x9fc00; /// 257KiB is more than we need, however we reserve this space for potential future use of /// ACPI features (new tables and/or devices). pub const SYSTEM_MEM_SIZE: u64 = RSDP_ADDR - SYSTEM_MEM_START; + +/// First address that cannot be addressed using 32 bit anymore. +pub const FIRST_ADDR_PAST_32BITS: u64 = 1 << 32; + +/// The size of the memory area reserved for MMIO 32-bit accesses. +pub const MMIO32_MEM_SIZE: u64 = mib_to_bytes(1024) as u64; +/// The start of the memory area reserved for MMIO 32-bit accesses. +pub const MMIO32_MEM_START: u64 = FIRST_ADDR_PAST_32BITS - MMIO32_MEM_SIZE; + +// We dedicate the last 256 MiB of the 32-bit MMIO address space PCIe for memory-mapped access to +// configuration. +/// Size of MMIO region for PCIe configuration accesses. 
+pub const PCI_MMCONFIG_SIZE: u64 = 256 << 20; +/// Start of MMIO region for PCIe configuration accesses. +pub const PCI_MMCONFIG_START: u64 = IOAPIC_ADDR as u64 - PCI_MMCONFIG_SIZE; +/// MMIO space per PCIe segment +pub const PCI_MMIO_CONFIG_SIZE_PER_SEGMENT: u64 = 4096 * 256; + +// We reserve 768 MiB for devices at the beginning of the MMIO region. This includes space both for +// pure MMIO and PCIe devices. + +/// Memory region start for boot device. +pub const BOOT_DEVICE_MEM_START: u64 = MMIO32_MEM_START; + +/// Beginning of memory region for device MMIO 32-bit accesses +pub const MEM_32BIT_DEVICES_START: u64 = BOOT_DEVICE_MEM_START + MMIO_LEN; +/// Size of memory region for device MMIO 32-bit accesses +pub const MEM_32BIT_DEVICES_SIZE: u64 = PCI_MMCONFIG_START - MEM_32BIT_DEVICES_START; + +// 64-bits region for MMIO accesses +/// The start of the memory area reserved for MMIO 64-bit accesses. +pub const MMIO64_MEM_START: u64 = 256 << 30; +/// The size of the memory area reserved for MMIO 64-bit accesses. +pub const MMIO64_MEM_SIZE: u64 = 256 << 30; + +// At the moment, all of this region goes to devices +/// Beginning of memory region for device MMIO 64-bit accesses +pub const MEM_64BIT_DEVICES_START: u64 = MMIO64_MEM_START; +/// Size of memory region for device MMIO 32-bit accesses +pub const MEM_64BIT_DEVICES_SIZE: u64 = MMIO64_MEM_SIZE; +/// First address past the 64-bit MMIO gap +pub const FIRST_ADDR_PAST_64BITS_MMIO: u64 = MMIO64_MEM_START + MMIO64_MEM_SIZE; diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index c54ec46c987..add5bd52dd7 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -33,7 +33,10 @@ pub mod generated; use std::fs::File; -use layout::CMDLINE_START; +use layout::{ + CMDLINE_START, FIRST_ADDR_PAST_32BITS, FIRST_ADDR_PAST_64BITS_MMIO, MMIO32_MEM_SIZE, + MMIO32_MEM_START, MMIO64_MEM_SIZE, MMIO64_MEM_START, PCI_MMCONFIG_SIZE, PCI_MMCONFIG_START, +}; use linux_loader::configurator::linux::LinuxBootConfigurator; use linux_loader::configurator::pvh::PvhBootConfigurator; use linux_loader::configurator::{BootConfigurator, BootParams}; @@ -47,17 +50,17 @@ use log::debug; use super::EntryPoint; use crate::acpi::create_acpi_tables; -use crate::arch::{BootProtocol, SYSTEM_MEM_SIZE, SYSTEM_MEM_START}; +use crate::arch::{BootProtocol, SYSTEM_MEM_SIZE, SYSTEM_MEM_START, arch_memory_regions_with_gap}; use crate::cpu_config::templates::{CustomCpuTemplate, GuestConfigError}; use crate::cpu_config::x86_64::CpuConfiguration; use crate::initrd::InitrdConfig; -use crate::utils::{align_down, mib_to_bytes, u64_to_usize, usize_to_u64}; +use crate::utils::{align_down, u64_to_usize, usize_to_u64}; use crate::vmm_config::machine_config::MachineConfig; use crate::vstate::memory::{ Address, GuestAddress, GuestMemory, GuestMemoryMmap, GuestMemoryRegion, }; use crate::vstate::vcpu::KvmVcpuConfigureError; -use crate::{Vcpu, VcpuConfig, Vmm}; +use crate::{Vcpu, VcpuConfig, Vmm, logger}; // Value taken from https://elixir.bootlin.com/linux/v5.10.68/source/arch/x86/include/uapi/asm/e820.h#L31 // Usable normal RAM @@ -96,48 +99,53 @@ pub enum ConfigurationError { Acpi(#[from] crate::acpi::AcpiError), } -/// First address that cannot be addressed using 32 bit anymore. -pub const FIRST_ADDR_PAST_32BITS: u64 = 1 << 32; - -/// Size of MMIO gap at top of 32-bit address space. -pub const MEM_32BIT_GAP_SIZE: u64 = mib_to_bytes(768) as u64; -/// The start of the memory area reserved for MMIO devices. 
-pub const MMIO_MEM_START: u64 = FIRST_ADDR_PAST_32BITS - MEM_32BIT_GAP_SIZE; -/// The size of the memory area reserved for MMIO devices. -pub const MMIO_MEM_SIZE: u64 = MEM_32BIT_GAP_SIZE; - /// Returns a Vec of the valid memory addresses. /// These should be used to configure the GuestMemoryMmap structure for the platform. -/// For x86_64 all addresses are valid from the start of the kernel except a -/// carve out at the end of 32bit address space. -pub fn arch_memory_regions(offset: usize, size: usize) -> Vec<(GuestAddress, usize)> { +/// For x86_64 all addresses are valid from the start of the kernel except an 1GB +/// carve out at the end of 32bit address space and a second 256GB one at the 256GB limit. +pub fn arch_memory_regions(size: usize) -> Vec<(GuestAddress, usize)> { // If we get here with size == 0 something has seriously gone wrong. Firecracker should never // try to allocate guest memory of size 0 assert!(size > 0, "Attempt to allocate guest memory of length 0"); - assert!( - offset.checked_add(size).is_some(), - "Attempt to allocate guest memory such that the address space would wrap around" + + let dram_size = std::cmp::min( + usize::MAX - u64_to_usize(MMIO32_MEM_SIZE) - u64_to_usize(MMIO64_MEM_SIZE), + size, ); - // It's safe to cast MMIO_MEM_START to usize because it fits in a u32 variable - // (It points to an address in the 32 bit space). - match (size + offset).checked_sub(u64_to_usize(MMIO_MEM_START)) { - // case1: guest memory fits before the gap - None | Some(0) => vec![(GuestAddress(offset as u64), size)], - // case2: starts before the gap, but doesn't completely fit - Some(remaining) if (offset as u64) < MMIO_MEM_START => vec![ - ( - GuestAddress(offset as u64), - u64_to_usize(MMIO_MEM_START) - offset, - ), - (GuestAddress(FIRST_ADDR_PAST_32BITS), remaining), - ], - // case3: guest memory start after the gap - Some(_) => vec![( - GuestAddress(FIRST_ADDR_PAST_32BITS.max(offset as u64)), + if dram_size != size { + logger::warn!( + "Requested memory size {} exceeds architectural maximum (1022GiB). Size has been \ + truncated to {}", size, - )], + dram_size + ); } + + let mut regions = vec![]; + + if let Some((start_past_32bit_gap, remaining_past_32bit_gap)) = arch_memory_regions_with_gap( + &mut regions, + 0, + dram_size, + u64_to_usize(MMIO32_MEM_START), + u64_to_usize(MMIO32_MEM_SIZE), + ) { + if let Some((start_past_64bit_gap, remaining_past_64bit_gap)) = arch_memory_regions_with_gap( + &mut regions, + start_past_32bit_gap, + remaining_past_32bit_gap, + u64_to_usize(MMIO64_MEM_START), + u64_to_usize(MMIO64_MEM_SIZE), + ) { + regions.push(( + GuestAddress(start_past_64bit_gap as u64), + remaining_past_64bit_gap, + )); + } + } + + regions } /// Returns the memory address where the kernel could be loaded. @@ -237,7 +245,9 @@ fn configure_pvh( ) -> Result<(), ConfigurationError> { const XEN_HVM_START_MAGIC_VALUE: u32 = 0x336e_c578; let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS); - let end_32bit_gap_start = GuestAddress(MMIO_MEM_START); + let end_32bit_gap_start = GuestAddress(MMIO32_MEM_START); + let first_addr_past_64bits = GuestAddress(FIRST_ADDR_PAST_64BITS_MMIO); + let end_64bit_gap_start = GuestAddress(MMIO64_MEM_START); let himem_start = GuestAddress(layout::HIMEM_START); // Vector to hold modules (currently either empty or holding initrd). 
@@ -269,32 +279,42 @@ fn configure_pvh( type_: E820_RESERVED, ..Default::default() }); + memmap.push(hvm_memmap_table_entry { + addr: PCI_MMCONFIG_START, + size: PCI_MMCONFIG_SIZE, + type_: E820_RESERVED, + ..Default::default() + }); let last_addr = guest_mem.last_addr(); - if last_addr < end_32bit_gap_start { + + if last_addr > first_addr_past_64bits { memmap.push(hvm_memmap_table_entry { - addr: himem_start.raw_value(), - size: last_addr.unchecked_offset_from(himem_start) + 1, + addr: first_addr_past_64bits.raw_value(), + size: last_addr.unchecked_offset_from(first_addr_past_64bits) + 1, type_: MEMMAP_TYPE_RAM, ..Default::default() }); - } else { + } + + if last_addr > first_addr_past_32bits { memmap.push(hvm_memmap_table_entry { - addr: himem_start.raw_value(), - size: end_32bit_gap_start.unchecked_offset_from(himem_start), + addr: first_addr_past_32bits.raw_value(), + size: (end_64bit_gap_start.unchecked_offset_from(first_addr_past_32bits)) + .min(last_addr.unchecked_offset_from(first_addr_past_32bits) + 1), type_: MEMMAP_TYPE_RAM, ..Default::default() }); - - if last_addr > first_addr_past_32bits { - memmap.push(hvm_memmap_table_entry { - addr: first_addr_past_32bits.raw_value(), - size: last_addr.unchecked_offset_from(first_addr_past_32bits) + 1, - type_: MEMMAP_TYPE_RAM, - ..Default::default() - }); - } } + memmap.push(hvm_memmap_table_entry { + addr: himem_start.raw_value(), + size: end_32bit_gap_start + .unchecked_offset_from(himem_start) + .min(last_addr.unchecked_offset_from(himem_start) + 1), + type_: MEMMAP_TYPE_RAM, + ..Default::default() + }); + // Construct the hvm_start_info structure and serialize it into // boot_params. This will be stored at PVH_INFO_START address, and %rbx // will be initialized to contain PVH_INFO_START prior to starting the @@ -340,7 +360,9 @@ fn configure_64bit_boot( const KERNEL_LOADER_OTHER: u8 = 0xff; const KERNEL_MIN_ALIGNMENT_BYTES: u32 = 0x0100_0000; // Must be non-zero. 
let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS); - let end_32bit_gap_start = GuestAddress(MMIO_MEM_START); + let end_32bit_gap_start = GuestAddress(MMIO32_MEM_START); + let first_addr_past_64bits = GuestAddress(FIRST_ADDR_PAST_64BITS_MMIO); + let end_64bit_gap_start = GuestAddress(MMIO64_MEM_START); let himem_start = GuestAddress(layout::HIMEM_START); @@ -371,39 +393,42 @@ fn configure_64bit_boot( layout::SYSTEM_MEM_SIZE, E820_RESERVED, )?; + add_e820_entry( + &mut params, + PCI_MMCONFIG_START, + PCI_MMCONFIG_SIZE, + E820_RESERVED, + )?; let last_addr = guest_mem.last_addr(); - if last_addr < end_32bit_gap_start { + + if last_addr > first_addr_past_64bits { add_e820_entry( &mut params, - himem_start.raw_value(), - // it's safe to use unchecked_offset_from because - // mem_end > himem_start - last_addr.unchecked_offset_from(himem_start) + 1, + first_addr_past_64bits.raw_value(), + last_addr.unchecked_offset_from(first_addr_past_64bits) + 1, E820_RAM, )?; - } else { + } + + if last_addr > first_addr_past_32bits { add_e820_entry( &mut params, - himem_start.raw_value(), - // it's safe to use unchecked_offset_from because - // end_32bit_gap_start > himem_start - end_32bit_gap_start.unchecked_offset_from(himem_start), + first_addr_past_32bits.raw_value(), + (end_64bit_gap_start.unchecked_offset_from(first_addr_past_32bits)) + .min(last_addr.unchecked_offset_from(first_addr_past_32bits) + 1), E820_RAM, )?; - - if last_addr > first_addr_past_32bits { - add_e820_entry( - &mut params, - first_addr_past_32bits.raw_value(), - // it's safe to use unchecked_offset_from because - // mem_end > first_addr_past_32bits - last_addr.unchecked_offset_from(first_addr_past_32bits) + 1, - E820_RAM, - )?; - } } + add_e820_entry( + &mut params, + himem_start.raw_value(), + (last_addr.unchecked_offset_from(himem_start) + 1) + .min(end_32bit_gap_start.unchecked_offset_from(himem_start)), + E820_RAM, + )?; + LinuxBootConfigurator::write_bootparams( &BootParams::new(¶ms, GuestAddress(layout::ZERO_PAGE_START)), guest_mem, @@ -468,51 +493,69 @@ pub fn load_kernel( #[cfg(kani)] mod verification { - use crate::arch::x86_64::FIRST_ADDR_PAST_32BITS; - use crate::arch::{MMIO_MEM_START, arch_memory_regions}; + + use crate::arch::arch_memory_regions; + use crate::arch::x86_64::layout::{ + FIRST_ADDR_PAST_32BITS, FIRST_ADDR_PAST_64BITS_MMIO, MMIO32_MEM_SIZE, MMIO32_MEM_START, + MMIO64_MEM_SIZE, MMIO64_MEM_START, + }; + use crate::utils::u64_to_usize; #[kani::proof] - #[kani::unwind(3)] + #[kani::unwind(4)] fn verify_arch_memory_regions() { - let offset: u64 = kani::any::(); let len: u64 = kani::any::(); kani::assume(len > 0); - kani::assume(offset.checked_add(len).is_some()); - let regions = arch_memory_regions(offset as usize, len as usize); + let regions = arch_memory_regions(len as usize); - // There's only one MMIO gap, so we can get either 1 or 2 regions - assert!(regions.len() <= 2); + // There are two MMIO gaps, so we can get either 1, 2 or 3 regions + assert!(regions.len() <= 3); assert!(regions.len() >= 1); + // The first address is always 0 + assert_eq!(regions[0].0.0, 0); + // The total length of all regions is what we requested - assert_eq!( - regions.iter().map(|&(_, len)| len).sum::(), - len as usize - ); + let actual_size = regions.iter().map(|&(_, len)| len).sum::(); + assert!(actual_size <= len as usize); + if actual_size < u64_to_usize(len) { + assert_eq!( + actual_size, + usize::MAX - u64_to_usize(MMIO32_MEM_SIZE) - u64_to_usize(MMIO64_MEM_SIZE) + ); + } // No region overlaps the MMIO gap 
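+ // (neither the 32-bit hole at [3 GiB, 4 GiB) nor the 64-bit one at [256 GiB, 512 GiB))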
assert!( regions .iter() - .all(|&(start, len)| start.0 >= FIRST_ADDR_PAST_32BITS - || start.0 + len as u64 <= MMIO_MEM_START) + .all(|&(start, len)| (start.0 >= FIRST_ADDR_PAST_32BITS + || start.0 + len as u64 <= MMIO32_MEM_START) + && (start.0 >= FIRST_ADDR_PAST_64BITS_MMIO + || start.0 + len as u64 <= MMIO64_MEM_START)) ); - // All regions start after our specified offset - assert!(regions.iter().all(|&(start, _)| start.0 >= offset as u64)); - // All regions have non-zero length assert!(regions.iter().all(|&(_, len)| len > 0)); - // If there's two regions, they perfectly snuggle up to the MMIO gap - if regions.len() == 2 { + // If there's at least two regions, they perfectly snuggle up to one of the two MMIO gaps + if regions.len() >= 2 { kani::cover!(); - assert_eq!(regions[0].0.0 + regions[0].1 as u64, MMIO_MEM_START); + assert_eq!(regions[0].0.0 + regions[0].1 as u64, MMIO32_MEM_START); assert_eq!(regions[1].0.0, FIRST_ADDR_PAST_32BITS); } + + // If there are three regions, the last two perfectly snuggle up to the 64bit + // MMIO gap + if regions.len() == 3 { + kani::cover!(); + + assert_eq!(regions[1].0.0 + regions[1].1 as u64, MMIO64_MEM_START); + assert_eq!(regions[2].0.0, FIRST_ADDR_PAST_64BITS_MMIO); + } } } @@ -523,37 +566,25 @@ mod tests { use super::*; use crate::device_manager::resources::ResourceAllocator; use crate::test_utils::{arch_mem, single_region_mem}; + use crate::utils::mib_to_bytes; #[test] fn regions_lt_4gb() { - let regions = arch_memory_regions(0, 1usize << 29); + let regions = arch_memory_regions(1usize << 29); assert_eq!(1, regions.len()); assert_eq!(GuestAddress(0), regions[0].0); assert_eq!(1usize << 29, regions[0].1); - - let regions = arch_memory_regions(1 << 28, 1 << 29); - assert_eq!(1, regions.len()); - assert_eq!(regions[0], (GuestAddress(1 << 28), 1 << 29)); } #[test] fn regions_gt_4gb() { const MEMORY_SIZE: usize = (1 << 32) + 0x8000; - let regions = arch_memory_regions(0, MEMORY_SIZE); + let regions = arch_memory_regions(MEMORY_SIZE); assert_eq!(2, regions.len()); assert_eq!(GuestAddress(0), regions[0].0); assert_eq!(GuestAddress(1u64 << 32), regions[1].0); - let regions = arch_memory_regions(1 << 31, MEMORY_SIZE); - assert_eq!(2, regions.len()); - assert_eq!( - regions[0], - ( - GuestAddress(1 << 31), - u64_to_usize(MMIO_MEM_START) - (1 << 31) - ) - ); assert_eq!( regions[1], ( diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index aa801b85ed1..3abac133b82 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -1038,8 +1038,8 @@ pub(crate) mod tests { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] assert!(cmdline_contains( &cmdline, - "virtio_mmio.device=4K@0xd0000000:5 virtio_mmio.device=4K@0xd0001000:6 \ - virtio_mmio.device=4K@0xd0002000:7" + "virtio_mmio.device=4K@0xc0001000:5 virtio_mmio.device=4K@0xc0002000:6 \ + virtio_mmio.device=4K@0xc0003000:7" )); } @@ -1137,7 +1137,7 @@ pub(crate) mod tests { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] assert!(cmdline_contains( &cmdline, - "virtio_mmio.device=4K@0xd0000000:5" + "virtio_mmio.device=4K@0xc0001000:5" )); } @@ -1154,7 +1154,7 @@ pub(crate) mod tests { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] assert!(cmdline_contains( &cmdline, - "virtio_mmio.device=4K@0xd0000000:5" + "virtio_mmio.device=4K@0xc0001000:5" )); } @@ -1173,7 +1173,7 @@ pub(crate) mod tests { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] assert!(cmdline_contains( &cmdline, - "virtio_mmio.device=4K@0xd0000000:5" + "virtio_mmio.device=4K@0xc0001000:5" )); } 
} diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 153f67639db..0ad8116c904 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -20,6 +20,9 @@ use serde::{Deserialize, Serialize}; use vm_allocator::AllocPolicy; use super::resources::ResourceAllocator; +use crate::arch::BOOT_DEVICE_MEM_START; +#[cfg(target_arch = "aarch64")] +use crate::arch::{RTC_MEM_START, SERIAL_MEM_START}; #[cfg(target_arch = "aarch64")] use crate::devices::legacy::{RTCDevice, SerialDevice}; use crate::devices::pseudo::BootTimer; @@ -175,7 +178,7 @@ impl MMIODeviceManager { }; let device_info = MMIODeviceInfo { - addr: resource_allocator.allocate_mmio_memory( + addr: resource_allocator.allocate_32bit_mmio_memory( MMIO_LEN, MMIO_LEN, AllocPolicy::FirstMatch, @@ -292,7 +295,12 @@ impl MMIODeviceManager { let device_info = if let Some(device_info) = device_info_opt { device_info } else { - self.allocate_mmio_resources(resource_allocator, 1)? + let gsi = resource_allocator.allocate_gsi(1)?; + MMIODeviceInfo { + addr: SERIAL_MEM_START, + len: MMIO_LEN, + irq: Some(gsi[0]), + } }; vm.register_irqfd( @@ -347,7 +355,12 @@ impl MMIODeviceManager { let device_info = if let Some(device_info) = device_info_opt { device_info } else { - self.allocate_mmio_resources(resource_allocator, 1)? + let gsi = resource_allocator.allocate_gsi(1)?; + MMIODeviceInfo { + addr: RTC_MEM_START, + len: MMIO_LEN, + irq: Some(gsi[0]), + } }; let device = MMIODevice { @@ -368,11 +381,15 @@ impl MMIODeviceManager { pub fn register_mmio_boot_timer( &mut self, mmio_bus: &vm_device::Bus, - resource_allocator: &mut ResourceAllocator, boot_timer: Arc>, ) -> Result<(), MmioError> { // Attach a new boot timer device. - let device_info = self.allocate_mmio_resources(resource_allocator, 0)?; + let device_info = MMIODeviceInfo { + addr: BOOT_DEVICE_MEM_START, + len: MMIO_LEN, + irq: None, + }; + let device = MMIODevice { resources: device_info, inner: boot_timer, @@ -697,23 +714,17 @@ pub(crate) mod tests { assert!(device_manager.get_virtio_device(0, "foo").is_none()); let dev = device_manager.get_virtio_device(0, "dummy").unwrap(); - assert_eq!(dev.resources.addr, arch::MMIO_MEM_START); + assert_eq!(dev.resources.addr, arch::MEM_32BIT_DEVICES_START); assert_eq!(dev.resources.len, MMIO_LEN); - assert_eq!( - dev.resources.irq, - Some(arch::GSI_BASE) - ); + assert_eq!(dev.resources.irq, Some(arch::GSI_BASE)); device_manager .for_each_virtio_device(|virtio_type, device_id, mmio_device| { assert_eq!(*virtio_type, 0); assert_eq!(device_id, "dummy"); - assert_eq!(mmio_device.resources.addr, arch::MMIO_MEM_START); + assert_eq!(mmio_device.resources.addr, arch::MEM_32BIT_DEVICES_START); assert_eq!(mmio_device.resources.len, MMIO_LEN); - assert_eq!( - mmio_device.resources.irq, - Some(arch::GSI_BASE) - ); + assert_eq!(mmio_device.resources.irq, Some(arch::GSI_BASE)); Ok::<(), ()>(()) }) .unwrap(); diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 3e3f0f0ffda..f6720233fd1 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -211,11 +211,8 @@ impl DeviceManager { ) -> Result<(), AttachMmioDeviceError> { let boot_timer = Arc::new(Mutex::new(BootTimer::new(request_ts))); - self.mmio_devices.register_mmio_boot_timer( - &self.mmio_bus, - &mut self.resource_allocator, - boot_timer, - )?; + self.mmio_devices + .register_mmio_boot_timer(&self.mmio_bus, boot_timer)?; Ok(()) } diff --git a/src/vmm/src/device_manager/persist.rs 
b/src/vmm/src/device_manager/persist.rs index c0288b03a59..72e1d6c3d9f 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -415,17 +415,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { .event_manager .add_subscriber(serial.clone()); - constructor_args - .resource_allocator - .allocate_mmio_memory( - MMIO_LEN, - MMIO_LEN, - AllocPolicy::ExactMatch(state.device_info.addr), - ) - .map_err(|e| { - DevicePersistError::DeviceManager(super::mmio::MmioError::Allocator(e)) - })?; - dev_manager.register_mmio_serial( vm, constructor_args.mmio_bus, @@ -436,16 +425,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { } if state.type_ == DeviceType::Rtc { let rtc = Arc::new(Mutex::new(RTCDevice::new())); - constructor_args - .resource_allocator - .allocate_mmio_memory( - MMIO_LEN, - MMIO_LEN, - AllocPolicy::ExactMatch(state.device_info.addr), - ) - .map_err(|e| { - DevicePersistError::DeviceManager(super::mmio::MmioError::Allocator(e)) - })?; dev_manager.register_mmio_rtc( constructor_args.mmio_bus, constructor_args.resource_allocator, @@ -489,7 +468,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { constructor_args .resource_allocator - .allocate_mmio_memory( + .allocate_32bit_mmio_memory( MMIO_LEN, MMIO_LEN, AllocPolicy::ExactMatch(device_info.addr), diff --git a/src/vmm/src/device_manager/resources.rs b/src/vmm/src/device_manager/resources.rs index 719426a1f55..6b6f43637f5 100644 --- a/src/vmm/src/device_manager/resources.rs +++ b/src/vmm/src/device_manager/resources.rs @@ -17,8 +17,10 @@ use crate::arch; pub struct ResourceAllocator { // Allocator for device interrupt lines gsi_allocator: IdAllocator, - // Allocator for memory in the MMIO address space - mmio_memory: AddressAllocator, + // Allocator for memory in the 32-bit MMIO address space + mmio32_memory: AddressAllocator, + // Allocator for memory in the 64-bit MMIO address space + mmio64_memory: AddressAllocator, // Memory allocator for system data system_memory: AddressAllocator, } @@ -28,7 +30,14 @@ impl ResourceAllocator { pub fn new() -> Result { Ok(Self { gsi_allocator: IdAllocator::new(arch::GSI_BASE, arch::GSI_MAX)?, - mmio_memory: AddressAllocator::new(arch::MMIO_MEM_START, arch::MMIO_MEM_SIZE)?, + mmio32_memory: AddressAllocator::new( + arch::MEM_32BIT_DEVICES_START, + arch::MEM_32BIT_DEVICES_SIZE, + )?, + mmio64_memory: AddressAllocator::new( + arch::MEM_64BIT_DEVICES_START, + arch::MEM_64BIT_DEVICES_SIZE, + )?, system_memory: AddressAllocator::new(arch::SYSTEM_MEM_START, arch::SYSTEM_MEM_SIZE)?, }) } @@ -57,7 +66,7 @@ impl ResourceAllocator { Ok(gsis) } - /// Allocate a memory range in MMIO address space + /// Allocate a memory range in 32-bit MMIO address space /// /// If it succeeds, it returns the first address of the allocated range /// @@ -66,13 +75,37 @@ impl ResourceAllocator { /// * `size` - The size in bytes of the memory to allocate /// * `alignment` - The alignment of the address of the first byte /// * `policy` - A [`vm_allocator::AllocPolicy`] variant for determining the allocation policy - pub fn allocate_mmio_memory( + pub fn allocate_32bit_mmio_memory( &mut self, size: u64, alignment: u64, policy: AllocPolicy, ) -> Result { - Ok(self.mmio_memory.allocate(size, alignment, policy)?.start()) + Ok(self + .mmio32_memory + .allocate(size, alignment, policy)? 
+ .start()) + } + + /// Allocate a memory range in 64-bit MMIO address space + /// + /// If it succeeds, it returns the first address of the allocated range + /// + /// # Arguments + /// + /// * `size` - The size in bytes of the memory to allocate + /// * `alignment` - The alignment of the address of the first byte + /// * `policy` - A [`vm_allocator::AllocPolicy`] variant for determining the allocation policy + pub fn allocate_64bit_mmio_memory( + &mut self, + size: u64, + alignment: u64, + policy: AllocPolicy, + ) -> Result { + Ok(self + .mmio64_memory + .allocate(size, alignment, policy)? + .start()) } /// Allocate a memory range for system data diff --git a/src/vmm/src/devices/virtio/vsock/event_handler.rs b/src/vmm/src/devices/virtio/vsock/event_handler.rs index a54998ba808..b4445e298ae 100755 --- a/src/vmm/src/devices/virtio/vsock/event_handler.rs +++ b/src/vmm/src/devices/virtio/vsock/event_handler.rs @@ -481,8 +481,8 @@ mod tests { #[cfg(target_arch = "x86_64")] #[allow(clippy::cast_possible_truncation)] /* casting of constants we know fit into u32 */ fn test_vsock_bof() { - use crate::arch::MMIO_MEM_START; - use crate::arch::x86_64::{FIRST_ADDR_PAST_32BITS, MEM_32BIT_GAP_SIZE}; + use crate::arch::x86_64::layout::FIRST_ADDR_PAST_32BITS; + use crate::arch::{MMIO32_MEM_SIZE, MMIO32_MEM_START}; use crate::devices::virtio::vsock::packet::VSOCK_PKT_HDR_SIZE; use crate::test_utils::multi_region_mem; use crate::utils::mib_to_bytes; @@ -493,7 +493,7 @@ mod tests { let mut test_ctx = TestContext::new(); test_ctx.mem = multi_region_mem(&[ (GuestAddress(0), 8 * MIB), - (GuestAddress(MMIO_MEM_START - MIB as u64), MIB), + (GuestAddress(MMIO32_MEM_START - MIB as u64), MIB), (GuestAddress(FIRST_ADDR_PAST_32BITS), MIB), ]); @@ -516,15 +516,15 @@ mod tests { } // Let's check what happens when the header descriptor is right before the gap. - vsock_bof_helper(&mut test_ctx, 0, MMIO_MEM_START - 1, VSOCK_PKT_HDR_SIZE); + vsock_bof_helper(&mut test_ctx, 0, MMIO32_MEM_START - 1, VSOCK_PKT_HDR_SIZE); // Let's check what happens when the buffer descriptor crosses into the gap, but does // not go past its right edge. vsock_bof_helper( &mut test_ctx, 1, - MMIO_MEM_START - 4, - MEM_32BIT_GAP_SIZE as u32 + 4, + MMIO32_MEM_START - 4, + MMIO32_MEM_SIZE as u32 + 4, ); // Let's modify the buffer descriptor addr and len such that it crosses over the MMIO gap, @@ -532,8 +532,8 @@ mod tests { vsock_bof_helper( &mut test_ctx, 1, - MMIO_MEM_START - 4, - MEM_32BIT_GAP_SIZE as u32 + 100, + MMIO32_MEM_START - 4, + MMIO32_MEM_SIZE as u32 + 100, ); } diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index 00199fd1fe2..1e2bd803e1d 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -473,7 +473,7 @@ impl VmResources { // a single way of backing guest memory for vhost-user and non-vhost-user cases, // that would not be worth the effort. 
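
With two windows available, callers can keep ranges that must stay below 4 GiB in the 32-bit window and push large ranges into the 64-bit one. A hypothetical helper sketching that usage; the helper name and the sizes are made up for illustration, and the error type is assumed to be `vm_allocator::Error`, as in `allocate_gsi`:

use vm_allocator::AllocPolicy;

use crate::device_manager::resources::ResourceAllocator;

// Hypothetical usage: a 4 KiB transport window below 4 GiB, plus a 256 MiB
// range that can live anywhere, so it goes to the 64-bit window to keep the
// scarce 32-bit space free.
fn place_device_windows(
    resource_allocator: &mut ResourceAllocator,
) -> Result<(u64, u64), vm_allocator::Error> {
    let low =
        resource_allocator.allocate_32bit_mmio_memory(0x1000, 0x1000, AllocPolicy::FirstMatch)?;
    let high = resource_allocator.allocate_64bit_mmio_memory(
        256 << 20,
        0x1000,
        AllocPolicy::FirstMatch,
    )?;
    Ok((low, high))
}
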
let regions = - crate::arch::arch_memory_regions(0, mib_to_bytes(self.machine_config.mem_size_mib)); + crate::arch::arch_memory_regions(mib_to_bytes(self.machine_config.mem_size_mib)); if vhost_user_device_used { memory::memfd_backed( regions.as_ref(), diff --git a/src/vmm/src/test_utils/mod.rs b/src/vmm/src/test_utils/mod.rs index ae2c4a9bd3b..fb936fe7659 100644 --- a/src/vmm/src/test_utils/mod.rs +++ b/src/vmm/src/test_utils/mod.rs @@ -58,11 +58,11 @@ pub fn multi_region_mem_raw(regions: &[(GuestAddress, usize)]) -> Vec GuestMemoryMmap { - multi_region_mem(&crate::arch::arch_memory_regions(0, mem_size_bytes)) + multi_region_mem(&crate::arch::arch_memory_regions(mem_size_bytes)) } pub fn arch_mem_raw(mem_size_bytes: usize) -> Vec { - multi_region_mem_raw(&crate::arch::arch_memory_regions(0, mem_size_bytes)) + multi_region_mem_raw(&crate::arch::arch_memory_regions(mem_size_bytes)) } pub fn create_vmm( From 352e2eaee613bed7a801e96afc1d69f6c8a4efa4 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 6 May 2025 13:00:48 +0200 Subject: [PATCH 16/99] refactor: prepare ResourceAllocator for PCIe devices PCIe devices need some times to relocate themselves in memory. To do so, they need to keep an (atomic) reference to a type that implements `DeviceRelocation` trait. The logic for relocation involves removing the device from the bus it has been registered to, allocate a new address range for it and reinsert it. Instead of creating a new type for it, reuse `ResourceAllocator`. This means that we need to move the buses from the `DeviceManager` inside `ResourceAllocator`. Signed-off-by: Babis Chalios --- src/vmm/src/acpi/mod.rs | 39 +++++++------- src/vmm/src/arch/aarch64/fdt.rs | 2 +- src/vmm/src/arch/x86_64/mod.rs | 18 +++---- src/vmm/src/arch/x86_64/mptable.rs | 30 +++++------ src/vmm/src/device_manager/mmio.rs | 49 ++++++++--------- src/vmm/src/device_manager/mod.rs | 56 +++++-------------- src/vmm/src/device_manager/persist.rs | 22 ++++---- src/vmm/src/device_manager/resources.rs | 71 ++++++++++++++++++------- src/vmm/src/devices/acpi/vmgenid.rs | 4 +- src/vmm/src/lib.rs | 4 +- 10 files changed, 146 insertions(+), 149 deletions(-) diff --git a/src/vmm/src/acpi/mod.rs b/src/vmm/src/acpi/mod.rs index 542e53409b7..a9b9b2bfb28 100644 --- a/src/vmm/src/acpi/mod.rs +++ b/src/vmm/src/acpi/mod.rs @@ -53,7 +53,7 @@ impl AcpiTableWriter<'_> { /// buffer. It returns the address in which it wrote the table. 
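
The relocation callback only ever gets a shared reference to the type implementing it, which is why the allocator needs interior mutability before it can take on that role. A simplified, self-contained illustration of the pattern; the `Relocation` trait and `Allocator` type below are stand-ins, not the actual `pci::DeviceRelocation` interface:

use std::sync::{Arc, Mutex};

// Stand-in for a relocation callback that is only handed `&self`.
trait Relocation {
    fn move_bar(&self, old_base: u64, new_base: u64, len: u64) -> std::io::Result<()>;
}

// Stand-in allocator: state lives behind a Mutex so `&self` methods can mutate it.
#[derive(Default)]
struct Allocator {
    ranges: Mutex<Vec<(u64, u64)>>,
}

impl Allocator {
    fn allocate(&self, base: u64, len: u64) {
        self.ranges.lock().unwrap().push((base, len));
    }
}

impl Relocation for Allocator {
    fn move_bar(&self, old_base: u64, new_base: u64, len: u64) -> std::io::Result<()> {
        let mut ranges = self.ranges.lock().unwrap();
        if let Some(range) = ranges.iter_mut().find(|r| **r == (old_base, len)) {
            *range = (new_base, len);
        }
        Ok(())
    }
}

fn main() {
    let allocator = Arc::new(Allocator::default());
    allocator.allocate(0xc000_0000, 0x1000);
    // The same Arc is handed out as a trait object to whoever triggers relocations.
    let relocator: Arc<dyn Relocation> = allocator.clone();
    relocator.move_bar(0xc000_0000, 0xc000_2000, 0x1000).unwrap();
}
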
fn write_acpi_table( &mut self, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, table: &mut S, ) -> Result where @@ -94,7 +94,7 @@ impl AcpiTableWriter<'_> { setup_arch_dsdt(&mut dsdt_data)?; let mut dsdt = Dsdt::new(OEM_ID, *b"FCVMDSDT", OEM_REVISION, dsdt_data); - self.write_acpi_table(&mut device_manager.resource_allocator, &mut dsdt) + self.write_acpi_table(&device_manager.resource_allocator, &mut dsdt) } /// Build the FADT table for the guest @@ -102,7 +102,7 @@ impl AcpiTableWriter<'_> { /// This includes a pointer with the location of the DSDT in guest memory fn build_fadt( &mut self, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, dsdt_addr: u64, ) -> Result { let mut fadt = Fadt::new(OEM_ID, *b"FCVMFADT", OEM_REVISION); @@ -120,7 +120,7 @@ impl AcpiTableWriter<'_> { /// This includes information about the interrupt controllers supported in the platform fn build_madt( &mut self, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, nr_vcpus: u8, ) -> Result { let mut madt = Madt::new( @@ -138,7 +138,7 @@ impl AcpiTableWriter<'_> { /// Currently, we pass to the guest just FADT and MADT tables. fn build_xsdt( &mut self, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, fadt_addr: u64, madt_addr: u64, ) -> Result { @@ -180,15 +180,14 @@ pub(crate) fn create_acpi_tables( vcpus: &[Vcpu], ) -> Result<(), AcpiError> { let mut writer = AcpiTableWriter { mem }; - let dsdt_addr = writer.build_dsdt(device_manager)?; - let fadt_addr = writer.build_fadt(&mut device_manager.resource_allocator, dsdt_addr)?; + + let fadt_addr = writer.build_fadt(&device_manager.resource_allocator, dsdt_addr)?; let madt_addr = writer.build_madt( - &mut device_manager.resource_allocator, + &device_manager.resource_allocator, vcpus.len().try_into().unwrap(), )?; - let xsdt_addr = - writer.build_xsdt(&mut device_manager.resource_allocator, fadt_addr, madt_addr)?; + let xsdt_addr = writer.build_xsdt(&device_manager.resource_allocator, fadt_addr, madt_addr)?; writer.build_rsdp(xsdt_addr) } @@ -227,7 +226,7 @@ mod tests { #[test] fn test_write_acpi_table_memory_allocation() { // A mocke Vmm object with 128MBs of memory - let mut vmm = default_vmm(); + let vmm = default_vmm(); let mut writer = AcpiTableWriter { mem: vmm.vm.guest_memory(), }; @@ -235,14 +234,14 @@ mod tests { // This should succeed let mut sdt = MockSdt(vec![0; 4096]); let addr = writer - .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START); // Let's try to write two 4K pages plus one byte let mut sdt = MockSdt(vec![0; usize::try_from(SYSTEM_MEM_SIZE + 1).unwrap()]); let err = writer - .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) .unwrap_err(); assert!( matches!( @@ -257,27 +256,27 @@ mod tests { // succeed. 
let mut sdt = MockSdt(vec![0; 5]); let addr = writer - .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4096); let mut sdt = MockSdt(vec![0; 2]); let addr = writer - .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4101); let mut sdt = MockSdt(vec![0; 4]); let addr = writer - .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4103); let mut sdt = MockSdt(vec![0; 8]); let addr = writer - .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4107); let mut sdt = MockSdt(vec![0; 16]); let addr = writer - .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4115); } @@ -294,11 +293,11 @@ mod tests { let mut writer = AcpiTableWriter { mem: vm.guest_memory(), }; - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); let mut sdt = MockSdt(vec![0; usize::try_from(SYSTEM_MEM_SIZE).unwrap()]); let err = writer - .write_acpi_table(&mut resource_allocator, &mut sdt) + .write_acpi_table(&resource_allocator, &mut sdt) .unwrap_err(); assert!( matches!( diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index 131be4b2e31..e2a42dd2982 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -477,7 +477,7 @@ mod tests { .register_virtio_test_device( &vm, mem.clone(), - &mut device_manager.resource_allocator, + &device_manager.resource_allocator, dummy, &mut cmdline, "dummy", diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index add5bd52dd7..fe1296e5d1c 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -213,7 +213,7 @@ pub fn configure_system_for_boot( // Note that this puts the mptable at the last 1k of Linux's 640k base RAM mptable::setup_mptable( vmm.vm.guest_memory(), - &mut vmm.device_manager.resource_allocator, + &vmm.device_manager.resource_allocator, vcpu_config.vcpu_count, ) .map_err(ConfigurationError::MpTableSetup)?; @@ -598,8 +598,8 @@ mod tests { fn test_system_configuration() { let no_vcpus = 4; let gm = single_region_mem(0x10000); - let mut resource_allocator = ResourceAllocator::new().unwrap(); - let err = mptable::setup_mptable(&gm, &mut resource_allocator, 1); + let resource_allocator = ResourceAllocator::new().unwrap(); + let err = mptable::setup_mptable(&gm, &resource_allocator, 1); assert!(matches!( err.unwrap_err(), mptable::MptableError::NotEnoughMemory @@ -608,24 +608,24 @@ mod tests { // Now assigning some memory that falls before the 32bit memory hole. 
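
The addresses asserted in the ACPI writer test above fall out of simple back-to-back placement: after the first 4 KiB table, each table starts exactly where the previous one ended. A quick arithmetic check of those offsets:

fn main() {
    // Sizes of the tables written after the first 4 KiB one, in order.
    let sizes = [5u64, 2, 4, 8, 16];
    let mut next = 4096u64; // offset of the first small table from SYSTEM_MEM_START
    let mut offsets = Vec::new();
    for size in sizes {
        offsets.push(next);
        next += size;
    }
    assert_eq!(offsets, [4096u64, 4101, 4103, 4107, 4115]);
}
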
let mem_size = mib_to_bytes(128); let gm = arch_mem(mem_size); - let mut resource_allocator = ResourceAllocator::new().unwrap(); - mptable::setup_mptable(&gm, &mut resource_allocator, no_vcpus).unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); + mptable::setup_mptable(&gm, &resource_allocator, no_vcpus).unwrap(); configure_64bit_boot(&gm, GuestAddress(0), 0, &None).unwrap(); configure_pvh(&gm, GuestAddress(0), &None).unwrap(); // Now assigning some memory that is equal to the start of the 32bit memory hole. let mem_size = mib_to_bytes(3328); let gm = arch_mem(mem_size); - let mut resource_allocator = ResourceAllocator::new().unwrap(); - mptable::setup_mptable(&gm, &mut resource_allocator, no_vcpus).unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); + mptable::setup_mptable(&gm, &resource_allocator, no_vcpus).unwrap(); configure_64bit_boot(&gm, GuestAddress(0), 0, &None).unwrap(); configure_pvh(&gm, GuestAddress(0), &None).unwrap(); // Now assigning some memory that falls after the 32bit memory hole. let mem_size = mib_to_bytes(3330); let gm = arch_mem(mem_size); - let mut resource_allocator = ResourceAllocator::new().unwrap(); - mptable::setup_mptable(&gm, &mut resource_allocator, no_vcpus).unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); + mptable::setup_mptable(&gm, &resource_allocator, no_vcpus).unwrap(); configure_64bit_boot(&gm, GuestAddress(0), 0, &None).unwrap(); configure_pvh(&gm, GuestAddress(0), &None).unwrap(); } diff --git a/src/vmm/src/arch/x86_64/mptable.rs b/src/vmm/src/arch/x86_64/mptable.rs index 6646c17e282..c397290c23e 100644 --- a/src/vmm/src/arch/x86_64/mptable.rs +++ b/src/vmm/src/arch/x86_64/mptable.rs @@ -116,7 +116,7 @@ fn compute_mp_size(num_cpus: u8) -> usize { /// Performs setup of the MP table for the given `num_cpus`. 
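
The MP floating-pointer structure and the MP configuration table verified below use the same validation scheme as ACPI tables: an 8-bit checksum byte chosen so that all bytes of the structure sum to zero modulo 256. A tiny, self-contained illustration of computing and verifying such a checksum:

// Sum all bytes with wrapping arithmetic; a valid structure sums to 0.
fn checksum(bytes: &[u8]) -> u8 {
    bytes.iter().fold(0u8, |acc, &b| acc.wrapping_add(b))
}

// Value to store in the checksum field so that the whole structure sums to 0.
fn checksum_field(rest_of_structure: &[u8]) -> u8 {
    0u8.wrapping_sub(checksum(rest_of_structure))
}

fn main() {
    let mut structure = vec![0x5Fu8, 0x4D, 0x50, 0x5F, 0x01, 0xF0]; // arbitrary bytes
    structure.push(checksum_field(&structure));
    assert_eq!(checksum(&structure), 0);
}
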
pub fn setup_mptable( mem: &GuestMemoryMmap, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, num_cpus: u8, ) -> Result<(), MptableError> { if num_cpus > MAX_SUPPORTED_CPUS { @@ -334,27 +334,27 @@ mod tests { fn bounds_check() { let num_cpus = 4; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); - setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); + setup_mptable(&mem, &resource_allocator, num_cpus).unwrap(); } #[test] fn bounds_check_fails() { let num_cpus = 4; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus) - 1); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); - setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap_err(); + setup_mptable(&mem, &resource_allocator, num_cpus).unwrap_err(); } #[test] fn mpf_intel_checksum() { let num_cpus = 1; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); - setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); + setup_mptable(&mem, &resource_allocator, num_cpus).unwrap(); let mpf_intel: mpspec::mpf_intel = mem.read_obj(GuestAddress(SYSTEM_MEM_START)).unwrap(); @@ -365,9 +365,9 @@ mod tests { fn mpc_table_checksum() { let num_cpus = 4; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); - setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); + setup_mptable(&mem, &resource_allocator, num_cpus).unwrap(); let mpf_intel: mpspec::mpf_intel = mem.read_obj(GuestAddress(SYSTEM_MEM_START)).unwrap(); let mpc_offset = GuestAddress(u64::from(mpf_intel.physptr)); @@ -388,9 +388,9 @@ mod tests { fn mpc_entry_count() { let num_cpus = 1; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); - setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); + setup_mptable(&mem, &resource_allocator, num_cpus).unwrap(); let mpf_intel: mpspec::mpf_intel = mem.read_obj(GuestAddress(SYSTEM_MEM_START)).unwrap(); let mpc_offset = GuestAddress(u64::from(mpf_intel.physptr)); @@ -419,8 +419,8 @@ mod tests { let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(MAX_SUPPORTED_CPUS)); for i in 0..MAX_SUPPORTED_CPUS { - let mut resource_allocator = ResourceAllocator::new().unwrap(); - setup_mptable(&mem, &mut resource_allocator, i).unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); + setup_mptable(&mem, &resource_allocator, i).unwrap(); let mpf_intel: mpspec::mpf_intel = mem.read_obj(GuestAddress(SYSTEM_MEM_START)).unwrap(); @@ -450,9 +450,9 @@ mod tests { fn cpu_entry_count_max() { let cpus = MAX_SUPPORTED_CPUS + 1; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(cpus)); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); - let result = setup_mptable(&mem, &mut resource_allocator, cpus).unwrap_err(); + let result = setup_mptable(&mem, &resource_allocator, 
cpus).unwrap_err(); assert_eq!(result, MptableError::TooManyCpus); } } diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 0ad8116c904..1fd21195803 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -168,7 +168,7 @@ impl MMIODeviceManager { /// Allocates resources for a new device to be added. fn allocate_mmio_resources( &mut self, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, irq_count: u32, ) -> Result { let irq = match resource_allocator.allocate_gsi(irq_count)?[..] { @@ -252,9 +252,8 @@ impl MMIODeviceManager { pub fn register_mmio_virtio_for_boot( &mut self, vm: &VmFd, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, device_id: String, - mmio_bus: &vm_device::Bus, mmio_device: MmioTransport, _cmdline: &mut kernel_cmdline::Cmdline, ) -> Result<(), MmioError> { @@ -275,7 +274,7 @@ impl MMIODeviceManager { device.resources.irq.unwrap(), )?; } - self.register_mmio_virtio(vm, device_id, mmio_bus, device)?; + self.register_mmio_virtio(vm, device_id, &resource_allocator.mmio_bus, device)?; Ok(()) } @@ -285,8 +284,7 @@ impl MMIODeviceManager { pub fn register_mmio_serial( &mut self, vm: &VmFd, - mmio_bus: &vm_device::Bus, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, serial: Arc>, device_info_opt: Option, ) -> Result<(), MmioError> { @@ -314,7 +312,7 @@ impl MMIODeviceManager { inner: serial, }; - mmio_bus.insert( + resource_allocator.mmio_bus.insert( device.inner.clone(), device.resources.addr, device.resources.len, @@ -345,8 +343,7 @@ impl MMIODeviceManager { /// given as parameter, otherwise allocate a new MMIO resources for it. pub fn register_mmio_rtc( &mut self, - mmio_bus: &vm_device::Bus, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, rtc: Arc>, device_info_opt: Option, ) -> Result<(), MmioError> { @@ -368,7 +365,7 @@ impl MMIODeviceManager { inner: rtc, }; - mmio_bus.insert( + resource_allocator.mmio_bus.insert( device.inner.clone(), device.resources.addr, device.resources.len, @@ -570,19 +567,17 @@ pub(crate) mod tests { &mut self, vm: &VmFd, guest_mem: GuestMemoryMmap, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, device: Arc>, cmdline: &mut kernel_cmdline::Cmdline, dev_id: &str, ) -> Result { let interrupt = Arc::new(IrqTrigger::new()); - let mmio_bus = vm_device::Bus::new(); let mmio_device = MmioTransport::new(guest_mem, interrupt, device.clone(), false); self.register_mmio_virtio_for_boot( vm, resource_allocator, dev_id.to_string(), - &mmio_bus, mmio_device, cmdline, )?; @@ -692,7 +687,7 @@ pub(crate) mod tests { let mut vm = Vm::new(&kvm).unwrap(); vm.register_memory_regions(guest_mem).unwrap(); let mut device_manager = MMIODeviceManager::new(); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); let dummy = Arc::new(Mutex::new(DummyDevice::new())); @@ -705,7 +700,7 @@ pub(crate) mod tests { .register_virtio_test_device( vm.fd(), vm.guest_memory().clone(), - &mut resource_allocator, + &resource_allocator, dummy, &mut cmdline, "dummy", @@ -740,7 +735,7 @@ pub(crate) mod tests { let mut vm = Vm::new(&kvm).unwrap(); vm.register_memory_regions(guest_mem).unwrap(); let mut device_manager = MMIODeviceManager::new(); - let mut resource_allocator = 
ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); #[cfg(target_arch = "x86_64")] @@ -753,7 +748,7 @@ pub(crate) mod tests { .register_virtio_test_device( vm.fd(), vm.guest_memory().clone(), - &mut resource_allocator, + &resource_allocator, Arc::new(Mutex::new(DummyDevice::new())), &mut cmdline, "dummy1", @@ -767,7 +762,7 @@ pub(crate) mod tests { .register_virtio_test_device( vm.fd(), vm.guest_memory().clone(), - &mut resource_allocator, + &resource_allocator, Arc::new(Mutex::new(DummyDevice::new())), &mut cmdline, "dummy2" @@ -802,7 +797,7 @@ pub(crate) mod tests { vm.setup_irqchip(1).unwrap(); let mut device_manager = MMIODeviceManager::new(); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); let dummy = Arc::new(Mutex::new(DummyDevice::new())); @@ -812,7 +807,7 @@ pub(crate) mod tests { .register_virtio_test_device( vm.fd(), vm.guest_memory().clone(), - &mut resource_allocator, + &resource_allocator, dummy, &mut cmdline, &id, @@ -842,7 +837,7 @@ pub(crate) mod tests { .register_virtio_test_device( vm.fd(), vm.guest_memory().clone(), - &mut resource_allocator, + &resource_allocator, dummy2, &mut cmdline, &id2, @@ -868,10 +863,10 @@ pub(crate) mod tests { #[test] fn test_no_irq_allocation() { let mut device_manager = MMIODeviceManager::new(); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); let device_info = device_manager - .allocate_mmio_resources(&mut resource_allocator, 0) + .allocate_mmio_resources(&resource_allocator, 0) .unwrap(); assert!(device_info.irq.is_none()); } @@ -879,10 +874,10 @@ pub(crate) mod tests { #[test] fn test_irq_allocation() { let mut device_manager = MMIODeviceManager::new(); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); let device_info = device_manager - .allocate_mmio_resources(&mut resource_allocator, 1) + .allocate_mmio_resources(&resource_allocator, 1) .unwrap(); assert_eq!(device_info.irq.unwrap(), crate::arch::GSI_BASE); } @@ -890,12 +885,12 @@ pub(crate) mod tests { #[test] fn test_allocation_failure() { let mut device_manager = MMIODeviceManager::new(); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); assert_eq!( format!( "{}", device_manager - .allocate_mmio_resources(&mut resource_allocator, 2) + .allocate_mmio_resources(&resource_allocator, 2) .unwrap_err() ), "Invalid MMIO IRQ configuration.".to_string() diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index f6720233fd1..bcc71236c63 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -96,15 +96,10 @@ pub enum AttachLegacyMmioDeviceError { /// A manager of all peripheral devices of Firecracker pub struct DeviceManager { /// Allocator for system memory and interrupt numbers - pub resource_allocator: ResourceAllocator, - /// MMIO bus - pub mmio_bus: Arc, + pub resource_allocator: Arc, /// MMIO devices pub mmio_devices: MMIODeviceManager, #[cfg(target_arch = "x86_64")] - /// Port IO bus - pub pio_bus: Arc, - #[cfg(target_arch = "x86_64")] /// Legacy devices pub legacy_devices: PortIODeviceManager, /// ACPI devices @@ -145,10 +140,7 @@ impl 
DeviceManager { vcpu_exit_evt: &EventFd, vmfd: &VmFd, ) -> Result { - let mmio_bus = Arc::new(vm_device::Bus::new()); - - #[cfg(target_arch = "x86_64")] - let pio_bus = Arc::new(vm_device::Bus::new()); + let resource_allocator = Arc::new(ResourceAllocator::new()?); #[cfg(target_arch = "x86_64")] let legacy_devices = { Self::set_stdout_nonblocking(); @@ -163,17 +155,14 @@ impl DeviceManager { // create pio dev manager with legacy devices let mut legacy_devices = PortIODeviceManager::new(serial, i8042)?; - legacy_devices.register_devices(&pio_bus, vmfd)?; + legacy_devices.register_devices(&resource_allocator.pio_bus, vmfd)?; legacy_devices }; Ok(DeviceManager { - resource_allocator: ResourceAllocator::new()?, - mmio_bus, + resource_allocator, mmio_devices: MMIODeviceManager::new(), #[cfg(target_arch = "x86_64")] - pio_bus, - #[cfg(target_arch = "x86_64")] legacy_devices, acpi_devices: ACPIDeviceManager::new(), }) @@ -194,9 +183,8 @@ impl DeviceManager { let device = MmioTransport::new(mem.clone(), interrupt, device, is_vhost_user); self.mmio_devices.register_mmio_virtio_for_boot( vmfd, - &mut self.resource_allocator, + &self.resource_allocator, id, - &self.mmio_bus, device, cmdline, )?; @@ -212,7 +200,7 @@ impl DeviceManager { let boot_timer = Arc::new(Mutex::new(BootTimer::new(request_ts))); self.mmio_devices - .register_mmio_boot_timer(&self.mmio_bus, boot_timer)?; + .register_mmio_boot_timer(&self.resource_allocator.mmio_bus, boot_timer)?; Ok(()) } @@ -222,7 +210,7 @@ impl DeviceManager { mem: &GuestMemoryMmap, vmfd: &VmFd, ) -> Result<(), AttachVmgenidError> { - let vmgenid = VmGenId::new(mem, &mut self.resource_allocator)?; + let vmgenid = VmGenId::new(mem, &self.resource_allocator)?; self.acpi_devices.attach_vmgenid(vmgenid, vmfd)?; Ok(()) } @@ -246,23 +234,14 @@ impl DeviceManager { // Make stdout non-blocking. 
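
After this change the ownership shape is: a single `ResourceAllocator` owns the buses, and every consumer (the device manager, the vCPUs, and later the PCI segment) holds a clone of the same `Arc`. A simplified sketch of that shape with stand-in types:

use std::sync::Arc;

struct Bus; // stand-in for vm_device::Bus

struct Allocator {
    mmio_bus: Arc<Bus>,
    pio_bus: Arc<Bus>,
}

struct Manager {
    resource_allocator: Arc<Allocator>,
}

fn main() {
    let resource_allocator = Arc::new(Allocator {
        mmio_bus: Arc::new(Bus),
        pio_bus: Arc::new(Bus),
    });
    let manager = Manager {
        resource_allocator: resource_allocator.clone(),
    };
    // A vCPU gets its own handle to the very same MMIO bus.
    let vcpu_mmio_bus = manager.resource_allocator.mmio_bus.clone();
    assert!(Arc::ptr_eq(&vcpu_mmio_bus, &resource_allocator.mmio_bus));
}
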
Self::set_stdout_nonblocking(); let serial = Self::setup_serial_device(event_manager)?; - self.mmio_devices.register_mmio_serial( - vmfd, - &self.mmio_bus, - &mut self.resource_allocator, - serial, - None, - )?; + self.mmio_devices + .register_mmio_serial(vmfd, &self.resource_allocator, serial, None)?; self.mmio_devices.add_mmio_serial_to_cmdline(cmdline)?; } let rtc = Arc::new(Mutex::new(RTCDevice::new())); - self.mmio_devices.register_mmio_rtc( - &self.mmio_bus, - &mut self.resource_allocator, - rtc, - None, - )?; + self.mmio_devices + .register_mmio_rtc(&self.resource_allocator, rtc, None)?; Ok(()) } } @@ -352,11 +331,10 @@ impl DeviceManager { ) -> Result<(), DevicePersistError> { // Restore MMIO devices let mmio_ctor_args = MMIODevManagerConstructorArgs { - mmio_bus: &self.mmio_bus, mem: restore_args.mem, vm: restore_args.vm, event_manager: restore_args.event_manager, - resource_allocator: &mut self.resource_allocator, + resource_allocator: &self.resource_allocator, vm_resources: restore_args.vm_resources, instance_id: restore_args.instance_id, restored_from_file: restore_args.restored_from_file, @@ -370,7 +348,7 @@ impl DeviceManager { // Restore ACPI devices let acpi_ctor_args = ACPIDeviceManagerConstructorArgs { mem: restore_args.mem, - resource_allocator: &mut self.resource_allocator, + resource_allocator: &self.resource_allocator, vm: restore_args.vm, }; self.acpi_devices = ACPIDeviceManager::restore(acpi_ctor_args, &state.acpi_state)?; @@ -387,12 +365,9 @@ pub(crate) mod tests { use crate::builder::tests::default_vmm; pub(crate) fn default_device_manager() -> DeviceManager { - let mmio_bus = Arc::new(vm_device::Bus::new()); - #[cfg(target_arch = "x86_64")] - let pio_bus = Arc::new(vm_device::Bus::new()); let mmio_devices = MMIODeviceManager::new(); let acpi_devices = ACPIDeviceManager::new(); - let resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); #[cfg(target_arch = "x86_64")] let legacy_devices = PortIODeviceManager::new( @@ -407,11 +382,8 @@ pub(crate) mod tests { DeviceManager { resource_allocator, - mmio_bus, mmio_devices, #[cfg(target_arch = "x86_64")] - pio_bus, - #[cfg(target_arch = "x86_64")] legacy_devices, acpi_devices, } diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 72e1d6c3d9f..e0b1baf381e 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -195,11 +195,10 @@ pub enum SharedDeviceType { } pub struct MMIODevManagerConstructorArgs<'a> { - pub mmio_bus: &'a vm_device::Bus, pub mem: &'a GuestMemoryMmap, pub vm: &'a VmFd, pub event_manager: &'a mut EventManager, - pub resource_allocator: &'a mut ResourceAllocator, + pub resource_allocator: &'a ResourceAllocator, pub vm_resources: &'a mut VmResources, pub instance_id: &'a str, pub restored_from_file: bool, @@ -224,7 +223,7 @@ pub struct ACPIDeviceManagerState { pub struct ACPIDeviceManagerConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, - pub resource_allocator: &'a mut ResourceAllocator, + pub resource_allocator: &'a ResourceAllocator, pub vm: &'a VmFd, } @@ -417,7 +416,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { dev_manager.register_mmio_serial( vm, - constructor_args.mmio_bus, constructor_args.resource_allocator, serial, Some(state.device_info), @@ -426,7 +424,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { if state.type_ == DeviceType::Rtc { let rtc = Arc::new(Mutex::new(RTCDevice::new())); dev_manager.register_mmio_rtc( - 
constructor_args.mmio_bus, constructor_args.resource_allocator, rtc, Some(state.device_info), @@ -514,7 +511,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { &balloon_state.transport_state, interrupt, &balloon_state.device_info, - constructor_args.mmio_bus, + &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -541,7 +538,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { &block_state.transport_state, interrupt, &block_state.device_info, - constructor_args.mmio_bus, + &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -583,7 +580,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { &net_state.transport_state, interrupt, &net_state.device_info, - constructor_args.mmio_bus, + &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -615,7 +612,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { &vsock_state.transport_state, interrupt, &vsock_state.device_info, - constructor_args.mmio_bus, + &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -641,7 +638,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { &entropy_state.transport_state, interrupt, &entropy_state.device_info, - constructor_args.mmio_bus, + &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -730,7 +727,7 @@ mod tests { // These need to survive so the restored blocks find them. let _block_files; let mut tmp_sock_file = TempFile::new().unwrap(); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); tmp_sock_file.remove().unwrap(); // Set up a vmm with one of each device, and get the serialized DeviceStates. { @@ -793,11 +790,10 @@ mod tests { let device_states: DeviceStates = Snapshot::deserialize(&mut buf.as_slice()).unwrap(); let vm_resources = &mut VmResources::default(); let restore_args = MMIODevManagerConstructorArgs { - mmio_bus: &vmm.device_manager.mmio_bus, mem: vmm.vm.guest_memory(), vm: vmm.vm.fd(), event_manager: &mut event_manager, - resource_allocator: &mut resource_allocator, + resource_allocator: &resource_allocator, vm_resources, instance_id: "microvm-id", restored_from_file: true, diff --git a/src/vmm/src/device_manager/resources.rs b/src/vmm/src/device_manager/resources.rs index 6b6f43637f5..2a93c7fd17f 100644 --- a/src/vmm/src/device_manager/resources.rs +++ b/src/vmm/src/device_manager/resources.rs @@ -1,8 +1,12 @@ // Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 +use std::sync::{Arc, Mutex}; + +use pci::DeviceRelocation; pub use vm_allocator::AllocPolicy; use vm_allocator::{AddressAllocator, IdAllocator}; +use vm_device::Bus; use crate::arch; @@ -16,29 +20,40 @@ use crate::arch; #[derive(Debug)] pub struct ResourceAllocator { // Allocator for device interrupt lines - gsi_allocator: IdAllocator, + pub gsi_allocator: Arc>, // Allocator for memory in the 32-bit MMIO address space - mmio32_memory: AddressAllocator, + pub mmio32_memory: Arc>, // Allocator for memory in the 64-bit MMIO address space - mmio64_memory: AddressAllocator, + pub mmio64_memory: Arc>, // Memory allocator for system data - system_memory: AddressAllocator, + pub system_memory: Arc>, + /// MMIO bus + pub mmio_bus: Arc, + #[cfg(target_arch = "x86_64")] + /// Port IO bus + pub pio_bus: Arc, } impl ResourceAllocator { /// Create a new resource allocator for Firecracker devices pub fn new() -> Result { Ok(Self { - gsi_allocator: IdAllocator::new(arch::GSI_BASE, arch::GSI_MAX)?, - mmio32_memory: AddressAllocator::new( + gsi_allocator: Arc::new(Mutex::new(IdAllocator::new(arch::GSI_BASE, arch::GSI_MAX)?)), + mmio32_memory: Arc::new(Mutex::new(AddressAllocator::new( arch::MEM_32BIT_DEVICES_START, arch::MEM_32BIT_DEVICES_SIZE, - )?, - mmio64_memory: AddressAllocator::new( + )?)), + mmio64_memory: Arc::new(Mutex::new(AddressAllocator::new( arch::MEM_64BIT_DEVICES_START, arch::MEM_64BIT_DEVICES_SIZE, - )?, - system_memory: AddressAllocator::new(arch::SYSTEM_MEM_START, arch::SYSTEM_MEM_SIZE)?, + )?)), + system_memory: Arc::new(Mutex::new(AddressAllocator::new( + arch::SYSTEM_MEM_START, + arch::SYSTEM_MEM_SIZE, + )?)), + mmio_bus: Arc::new(Bus::new()), + #[cfg(target_arch = "x86_64")] + pio_bus: Arc::new(Bus::new()), }) } @@ -47,16 +62,17 @@ impl ResourceAllocator { /// # Arguments /// /// * `gsi_count` - The number of GSIs to allocate - pub fn allocate_gsi(&mut self, gsi_count: u32) -> Result, vm_allocator::Error> { + pub fn allocate_gsi(&self, gsi_count: u32) -> Result, vm_allocator::Error> { + let mut gsi_allocator = self.gsi_allocator.lock().expect("Poisoned lock"); let mut gsis = Vec::with_capacity(gsi_count as usize); for _ in 0..gsi_count { - match self.gsi_allocator.allocate_id() { + match gsi_allocator.allocate_id() { Ok(gsi) => gsis.push(gsi), Err(err) => { // It is ok to unwrap here, we just allocated the GSI gsis.into_iter().for_each(|gsi| { - self.gsi_allocator.free_id(gsi).unwrap(); + gsi_allocator.free_id(gsi).unwrap(); }); return Err(err); } @@ -76,13 +92,15 @@ impl ResourceAllocator { /// * `alignment` - The alignment of the address of the first byte /// * `policy` - A [`vm_allocator::AllocPolicy`] variant for determining the allocation policy pub fn allocate_32bit_mmio_memory( - &mut self, + &self, size: u64, alignment: u64, policy: AllocPolicy, ) -> Result { Ok(self .mmio32_memory + .lock() + .expect("Poisoned lock") .allocate(size, alignment, policy)? .start()) } @@ -97,13 +115,15 @@ impl ResourceAllocator { /// * `alignment` - The alignment of the address of the first byte /// * `policy` - A [`vm_allocator::AllocPolicy`] variant for determining the allocation policy pub fn allocate_64bit_mmio_memory( - &mut self, + &self, size: u64, alignment: u64, policy: AllocPolicy, ) -> Result { Ok(self .mmio64_memory + .lock() + .expect("Poisoned lock") .allocate(size, alignment, policy)? 
.start()) } @@ -118,18 +138,33 @@ impl ResourceAllocator { /// * `alignment` - The alignment of the address of the first byte /// * `policy` - A [`vm_allocator::AllocPolicy`] variant for determining the allocation policy pub fn allocate_system_memory( - &mut self, + &self, size: u64, alignment: u64, policy: AllocPolicy, ) -> Result { Ok(self .system_memory + .lock() + .expect("Poisoned lock") .allocate(size, alignment, policy)? .start()) } } +impl DeviceRelocation for ResourceAllocator { + fn move_bar( + &self, + _old_base: u64, + _new_base: u64, + _len: u64, + _pci_dev: &mut dyn pci::PciDevice, + _region_type: pci::PciBarRegionType, + ) -> Result<(), std::io::Error> { + todo!() + } +} + #[cfg(test)] mod tests { use super::ResourceAllocator; @@ -139,7 +174,7 @@ mod tests { #[test] fn test_allocate_gsi() { - let mut allocator = ResourceAllocator::new().unwrap(); + let allocator = ResourceAllocator::new().unwrap(); // asking for 0 IRQs should return us an empty vector assert_eq!(allocator.allocate_gsi(0), Ok(vec![])); // We cannot allocate more GSIs than available @@ -160,7 +195,7 @@ mod tests { // But we should be able to ask for 0 GSIs assert_eq!(allocator.allocate_gsi(0), Ok(vec![])); - let mut allocator = ResourceAllocator::new().unwrap(); + let allocator = ResourceAllocator::new().unwrap(); // We should be able to allocate 1 GSI assert_eq!(allocator.allocate_gsi(1), Ok(vec![arch::GSI_BASE])); // We can't allocate MAX_IRQS any more diff --git a/src/vmm/src/devices/acpi/vmgenid.rs b/src/vmm/src/devices/acpi/vmgenid.rs index 31dbf64ec39..df0656bfbcc 100644 --- a/src/vmm/src/devices/acpi/vmgenid.rs +++ b/src/vmm/src/devices/acpi/vmgenid.rs @@ -86,7 +86,7 @@ impl VmGenId { /// Allocate memory and a GSI for sending notifications and build the device pub fn new( mem: &GuestMemoryMmap, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, ) -> Result { let gsi = resource_allocator.allocate_gsi(1)?; // The generation ID needs to live in an 8-byte aligned buffer @@ -133,7 +133,7 @@ pub struct VMGenIDState { #[derive(Debug)] pub struct VMGenIdConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, - pub resource_allocator: &'a mut ResourceAllocator, + pub resource_allocator: &'a ResourceAllocator, } impl<'a> Persist<'a> for VmGenId { diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index b8aca60a00c..8f19e780766 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -369,10 +369,10 @@ impl Vmm { self.vcpus_handles.reserve(vcpu_count); for mut vcpu in vcpus.drain(..) { - vcpu.set_mmio_bus(self.device_manager.mmio_bus.clone()); + vcpu.set_mmio_bus(self.device_manager.resource_allocator.mmio_bus.clone()); #[cfg(target_arch = "x86_64")] vcpu.kvm_vcpu - .set_pio_bus(self.device_manager.pio_bus.clone()); + .set_pio_bus(self.device_manager.resource_allocator.pio_bus.clone()); self.vcpus_handles .push(vcpu.start_threaded(vcpu_seccomp_filter.clone(), barrier.clone())?); From 083d7ce62635579c72d46bce562d18217bc94f7e Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 6 May 2025 17:32:40 +0200 Subject: [PATCH 17/99] pci: add support for PCIe segment Add a PCIe segment which includes a single PCIe root port and a bus. At the moment, the PCIe segment is always enabled. Later commit will make it optional and enable it only when a command line argument flag is passed to Firecracker binary. 
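
Guests reach the segment's configuration space through ECAM: each (bus, device, function) gets a 4 KiB window inside the MMCONFIG region that starts at PCI_MMCONFIG_START. A small sketch of the standard ECAM offset encoding; the helper below is illustrative only:

// Offset of a function's 4 KiB config window within a segment's ECAM region.
fn ecam_offset(bus: u8, device: u8, function: u8, register: u16) -> u64 {
    assert!(device < 32 && function < 8 && register < 0x1000);
    (u64::from(bus) << 20)
        | (u64::from(device) << 15)
        | (u64::from(function) << 12)
        | u64::from(register)
}

fn main() {
    // Vendor ID (register 0) of device 3, function 0 on bus 0.
    assert_eq!(ecam_offset(0, 3, 0, 0), 3u64 << 15);
    // Last byte of the last function's window on bus 0.
    assert_eq!(ecam_offset(0, 31, 7, 0xFFF), (1u64 << 20) - 1);
}
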
Signed-off-by: Babis Chalios --- src/firecracker/src/main.rs | 2 +- src/vmm/src/acpi/mod.rs | 4 + src/vmm/src/arch/mod.rs | 4 +- src/vmm/src/builder.rs | 5 + src/vmm/src/device_manager/mod.rs | 14 + src/vmm/src/device_manager/pci_mngr.rs | 45 +++ src/vmm/src/devices/mod.rs | 3 + src/vmm/src/devices/pci/mod.rs | 6 + src/vmm/src/devices/pci/pci_segment.rs | 464 +++++++++++++++++++++++++ 9 files changed, 545 insertions(+), 2 deletions(-) create mode 100644 src/vmm/src/device_manager/pci_mngr.rs create mode 100644 src/vmm/src/devices/pci/mod.rs create mode 100644 src/vmm/src/devices/pci/pci_segment.rs diff --git a/src/firecracker/src/main.rs b/src/firecracker/src/main.rs index 6b01f776729..4d6536d054c 100644 --- a/src/firecracker/src/main.rs +++ b/src/firecracker/src/main.rs @@ -449,7 +449,7 @@ fn main_exec() -> Result<(), MainError> { /// the default the jailer would set). /// /// We do this resizing because the kernel default is 64, with a reallocation happening whenever -/// the tabel fills up. This was happening for some larger microVMs, and reallocating the +/// the table fills up. This was happening for some larger microVMs, and reallocating the /// fdtable while a lot of file descriptors are active (due to being eventfds/timerfds registered /// to epoll) incurs a penalty of 30ms-70ms on the snapshot restore path. fn resize_fdtable() -> Result<(), ResizeFdTableError> { diff --git a/src/vmm/src/acpi/mod.rs b/src/vmm/src/acpi/mod.rs index a9b9b2bfb28..65075781188 100644 --- a/src/vmm/src/acpi/mod.rs +++ b/src/vmm/src/acpi/mod.rs @@ -90,6 +90,10 @@ impl AcpiTableWriter<'_> { .acpi_devices .append_aml_bytes(&mut dsdt_data)?; + if let Some(pci_segment) = &device_manager.pci_devices.pci_segment { + pci_segment.append_aml_bytes(&mut dsdt_data)?; + } + // Architecture specific DSDT data setup_arch_dsdt(&mut dsdt_data)?; diff --git a/src/vmm/src/arch/mod.rs b/src/vmm/src/arch/mod.rs index 0d72241b13c..fbeb9fa0ce0 100644 --- a/src/vmm/src/arch/mod.rs +++ b/src/vmm/src/arch/mod.rs @@ -48,7 +48,9 @@ pub use crate::arch::x86_64::{ layout::GSI_BASE, layout::GSI_MAX, layout::IOAPIC_ADDR, layout::IRQ_BASE, layout::IRQ_MAX, layout::MEM_32BIT_DEVICES_SIZE, layout::MEM_32BIT_DEVICES_START, layout::MEM_64BIT_DEVICES_SIZE, layout::MEM_64BIT_DEVICES_START, layout::MMIO32_MEM_SIZE, - layout::MMIO32_MEM_START, layout::SYSTEM_MEM_SIZE, layout::SYSTEM_MEM_START, load_kernel, + layout::MMIO32_MEM_START, layout::PCI_MMCONFIG_SIZE, layout::PCI_MMCONFIG_START, + layout::PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, layout::SYSTEM_MEM_SIZE, layout::SYSTEM_MEM_START, + load_kernel, }; /// Types of devices that can get attached to this platform. 
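
The ECAM window of a segment scales with the number of buses it decodes: 4 KiB per function, 8 functions per device, 32 devices per bus. A quick sizing sketch; the actual per-segment size is the PCI_MMIO_CONFIG_SIZE_PER_SEGMENT constant exported above and is not restated here:

const ECAM_BYTES_PER_FUNCTION: u64 = 4096;
const FUNCTIONS_PER_DEVICE: u64 = 8;
const DEVICES_PER_BUS: u64 = 32;

fn ecam_window_size(buses: u64) -> u64 {
    buses * DEVICES_PER_BUS * FUNCTIONS_PER_DEVICE * ECAM_BYTES_PER_FUNCTION
}

fn main() {
    // A full 256-bus segment needs 256 MiB of configuration space.
    assert_eq!(ecam_window_size(256), 256u64 << 20);
    // A single-bus segment would only need 1 MiB.
    assert_eq!(ecam_window_size(1), 1u64 << 20);
}
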
diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 3abac133b82..7b30ef74189 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -24,6 +24,7 @@ use crate::cpu_config::templates::{ }; #[cfg(target_arch = "aarch64")] use crate::device_manager::AttachLegacyMmioDeviceError; +use crate::device_manager::pci_mngr::PciManagerError; use crate::device_manager::{ AttachMmioDeviceError, AttachVmgenidError, DeviceManager, DevicePersistError, DeviceRestoreArgs, }; @@ -71,6 +72,8 @@ pub enum StartMicrovmError { CreateLegacyDevice(device_manager::legacy::LegacyDeviceError), /// Error creating VMGenID device: {0} CreateVMGenID(VmGenIdError), + /// Error enabling PCIe support: {0} + EnablePciDevices(#[from] PciManagerError), /// Error enabling pvtime on vcpu: {0} #[cfg(target_arch = "aarch64")] EnablePVTime(crate::arch::VcpuArchError), @@ -214,6 +217,8 @@ pub fn build_microvm_for_boot( .map(|vcpu| vcpu.copy_kvm_vcpu_fd(vmm.vm())) .collect::, _>>()?; + vmm.device_manager.enable_pci()?; + // The boot timer device needs to be the first device attached in order // to maintain the same MMIO address referenced in the documentation // and tests. diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index bcc71236c63..5c01a195fc5 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -16,6 +16,7 @@ use legacy::{LegacyDeviceError, PortIODeviceManager}; use linux_loader::loader::Cmdline; use log::error; use mmio::{MMIODeviceManager, MmioError}; +use pci_mngr::{PciDevices, PciManagerError}; use persist::{ACPIDeviceManagerConstructorArgs, MMIODevManagerConstructorArgs}; use resources::ResourceAllocator; use serde::{Deserialize, Serialize}; @@ -43,6 +44,8 @@ pub mod acpi; pub mod legacy; /// Memory Mapped I/O Manager. pub mod mmio; +/// PCIe device manager +pub mod pci_mngr; /// Device managers (de)serialization support. pub mod persist; /// Resource manager for devices. @@ -104,6 +107,8 @@ pub struct DeviceManager { pub legacy_devices: PortIODeviceManager, /// ACPI devices pub acpi_devices: ACPIDeviceManager, + /// PCIe devices + pub pci_devices: PciDevices, } impl DeviceManager { @@ -165,6 +170,7 @@ impl DeviceManager { #[cfg(target_arch = "x86_64")] legacy_devices, acpi_devices: ACPIDeviceManager::new(), + pci_devices: PciDevices::new(), }) } @@ -244,6 +250,12 @@ impl DeviceManager { .register_mmio_rtc(&self.resource_allocator, rtc, None)?; Ok(()) } + + /// Enables PCIe support for Firecracker devices + pub fn enable_pci(&mut self) -> Result<(), PciManagerError> { + self.pci_devices + .attach_pci_segment(&self.resource_allocator) + } } #[derive(Debug, Default, Clone, Serialize, Deserialize)] @@ -367,6 +379,7 @@ pub(crate) mod tests { pub(crate) fn default_device_manager() -> DeviceManager { let mmio_devices = MMIODeviceManager::new(); let acpi_devices = ACPIDeviceManager::new(); + let pci_devices = PciDevices::new(); let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); #[cfg(target_arch = "x86_64")] @@ -386,6 +399,7 @@ pub(crate) mod tests { #[cfg(target_arch = "x86_64")] legacy_devices, acpi_devices, + pci_devices, } } diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs new file mode 100644 index 00000000000..c3bf2ada977 --- /dev/null +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -0,0 +1,45 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +use std::sync::Arc; + +use vm_device::BusError; + +use super::resources::ResourceAllocator; +use crate::devices::pci::PciSegment; + +#[derive(Debug, Default)] +pub struct PciDevices { + /// PCIe segment of the VMM, if PCI is enabled. We currently support a single PCIe segment. + pub pci_segment: Option, +} + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum PciManagerError { + /// Resource allocation error: {0} + ResourceAllocation(#[from] vm_allocator::Error), + /// Bus error: {0} + Bus(#[from] BusError), +} + +impl PciDevices { + pub fn new() -> Self { + Default::default() + } + + pub fn attach_pci_segment( + &mut self, + resource_allocator: &Arc, + ) -> Result<(), PciManagerError> { + // We only support a single PCIe segment. Calling this function twice is a Firecracker + // internal error. + assert!(self.pci_segment.is_none()); + + // Currently we don't assign any IRQs to PCI devices. We will be using MSI-X interrupts + // only. + let pci_segment = PciSegment::new(0, resource_allocator, &[0u8; 32])?; + self.pci_segment = Some(pci_segment); + + Ok(()) + } +} diff --git a/src/vmm/src/devices/mod.rs b/src/vmm/src/devices/mod.rs index dd58acc9337..371cc2cfa9e 100644 --- a/src/vmm/src/devices/mod.rs +++ b/src/vmm/src/devices/mod.rs @@ -7,10 +7,13 @@ //! Emulates virtual and hardware devices. +#![allow(unused)] + use std::io; pub mod acpi; pub mod legacy; +pub mod pci; pub mod pseudo; pub mod virtio; diff --git a/src/vmm/src/devices/pci/mod.rs b/src/vmm/src/devices/pci/mod.rs new file mode 100644 index 00000000000..e365b481893 --- /dev/null +++ b/src/vmm/src/devices/pci/mod.rs @@ -0,0 +1,6 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +pub mod pci_segment; + +pub use pci_segment::*; diff --git a/src/vmm/src/devices/pci/pci_segment.rs b/src/vmm/src/devices/pci/pci_segment.rs new file mode 100644 index 00000000000..169ffdcba3b --- /dev/null +++ b/src/vmm/src/devices/pci/pci_segment.rs @@ -0,0 +1,464 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// Copyright © 2019 - 2021 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause +// + +use std::sync::{Arc, Mutex}; + +#[cfg(target_arch = "x86_64")] +use acpi_tables::{Aml, aml}; +use log::info; +#[cfg(target_arch = "x86_64")] +use pci::{PCI_CONFIG_IO_PORT, PCI_CONFIG_IO_PORT_SIZE, PciConfigIo}; +use pci::{PciBdf, PciBus, PciConfigMmio, PciRoot, PciRootError}; +use uuid::Uuid; +use vm_allocator::AddressAllocator; +use vm_device::{BusDeviceSync, BusError}; + +use crate::arch::{PCI_MMCONFIG_START, PCI_MMIO_CONFIG_SIZE_PER_SEGMENT}; +use crate::device_manager::resources::ResourceAllocator; + +pub struct PciSegment { + pub(crate) id: u16, + pub(crate) pci_bus: Arc>, + pub(crate) pci_config_mmio: Arc>, + pub(crate) mmio_config_address: u64, + pub(crate) proximity_domain: u32, + + #[cfg(target_arch = "x86_64")] + pub(crate) pci_config_io: Option>>, + + // Bitmap of PCI devices to hotplug. + pub(crate) pci_devices_up: u32, + // Bitmap of PCI devices to hotunplug. + pub(crate) pci_devices_down: u32, + // List of allocated IRQs for each PCI slot. 
+ pub(crate) pci_irq_slots: [u8; 32], + + // Device memory covered by this segment + pub(crate) start_of_mem32_area: u64, + pub(crate) end_of_mem32_area: u64, + + pub(crate) start_of_mem64_area: u64, + pub(crate) end_of_mem64_area: u64, +} + +impl std::fmt::Debug for PciSegment { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PciSegment") + .field("id", &self.id) + .field("mmio_config_address", &self.mmio_config_address) + .field("proximity_domain", &self.proximity_domain) + .field("pci_devices_up", &self.pci_devices_up) + .field("pci_devices_down", &self.pci_devices_down) + .field("pci_irq_slots", &self.pci_irq_slots) + .field("start_of_mem32_area", &self.start_of_mem32_area) + .field("end_of_mem32_area", &self.end_of_mem32_area) + .field("start_of_mem64_area", &self.start_of_mem64_area) + .field("end_of_mem64_area", &self.end_of_mem64_area) + .finish() + } +} + +impl PciSegment { + fn build( + id: u16, + resource_allocator: &Arc, + pci_irq_slots: &[u8; 32], + ) -> Result { + let pci_root = PciRoot::new(None); + let pci_bus = Arc::new(Mutex::new(PciBus::new( + pci_root, + resource_allocator.clone(), + ))); + + let pci_config_mmio = Arc::new(Mutex::new(PciConfigMmio::new(Arc::clone(&pci_bus)))); + let mmio_config_address = PCI_MMCONFIG_START + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT * id as u64; + + resource_allocator.mmio_bus.insert( + Arc::clone(&pci_config_mmio) as Arc, + mmio_config_address, + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, + )?; + + let mem32_allocator = resource_allocator.mmio32_memory.clone(); + let mem64_allocator = resource_allocator.mmio64_memory.clone(); + + let start_of_mem32_area = mem32_allocator.lock().unwrap().base(); + let end_of_mem32_area = mem32_allocator.lock().unwrap().end(); + + let start_of_mem64_area = mem64_allocator.lock().unwrap().base(); + let end_of_mem64_area = mem64_allocator.lock().unwrap().end(); + + let segment = PciSegment { + id, + pci_bus, + pci_config_mmio, + mmio_config_address, + proximity_domain: 0, + pci_devices_up: 0, + pci_devices_down: 0, + #[cfg(target_arch = "x86_64")] + pci_config_io: None, + start_of_mem32_area, + end_of_mem32_area, + start_of_mem64_area, + end_of_mem64_area, + pci_irq_slots: *pci_irq_slots, + }; + + Ok(segment) + } + + #[cfg(target_arch = "x86_64")] + pub(crate) fn new( + id: u16, + resource_allocator: &Arc, + pci_irq_slots: &[u8; 32], + ) -> Result { + let mut segment = Self::build(id, resource_allocator, pci_irq_slots)?; + let pci_config_io = Arc::new(Mutex::new(PciConfigIo::new(Arc::clone(&segment.pci_bus)))); + + resource_allocator.pio_bus.insert( + pci_config_io.clone(), + PCI_CONFIG_IO_PORT, + PCI_CONFIG_IO_PORT_SIZE, + )?; + + segment.pci_config_io = Some(pci_config_io); + + info!( + "pci: adding PCI segment: id={:#x}, PCI MMIO config address: {:#x}, mem32 area: \ + [{:#x}-{:#x}], mem64 area: [{:#x}-{:#x}] IO area: [{PCI_CONFIG_IO_PORT:#x}-{:#x}]", + segment.id, + segment.mmio_config_address, + segment.start_of_mem32_area, + segment.end_of_mem32_area, + segment.start_of_mem64_area, + segment.end_of_mem64_area, + PCI_CONFIG_IO_PORT + PCI_CONFIG_IO_PORT_SIZE - 1 + ); + + Ok(segment) + } + + #[cfg(target_arch = "aarch64")] + pub(crate) fn new( + id: u16, + resource_allocator: &Arc, + pci_irq_slots: &[u8; 32], + ) -> Result { + let segment = Self::build(id, resource_allocator, pci_irq_slots)?; + info!( + "pci: adding PCI segment: id={:#x}, PCI MMIO config address: {:#x}, mem32 area: \ + [{:#x}-{:#x}], mem64 area: [{:#x}-{:#x}]", + segment.id, + segment.mmio_config_address, + 
segment.start_of_mem32_area, + segment.end_of_mem32_area, + segment.start_of_mem64_area, + segment.end_of_mem64_area, + ); + + Ok(segment) + } + + pub(crate) fn next_device_bdf(&self) -> Result { + Ok(PciBdf::new( + self.id, + 0, + self.pci_bus + .lock() + .unwrap() + .next_device_id()? + .try_into() + .unwrap(), + 0, + )) + } +} + +#[cfg(target_arch = "x86_64")] +struct PciDevSlot { + device_id: u8, +} + +#[cfg(target_arch = "x86_64")] +impl Aml for PciDevSlot { + fn append_aml_bytes(&self, v: &mut Vec) -> Result<(), aml::AmlError> { + let sun = self.device_id; + let adr: u32 = (self.device_id as u32) << 16; + aml::Device::new( + format!("S{:03}", self.device_id).as_str().try_into()?, + vec![ + &aml::Name::new("_SUN".try_into()?, &sun)?, + &aml::Name::new("_ADR".try_into()?, &adr)?, + &aml::Method::new( + "_EJ0".try_into()?, + 1, + true, + vec![&aml::MethodCall::new( + "\\_SB_.PHPR.PCEJ".try_into()?, + vec![&aml::Path::new("_SUN")?, &aml::Path::new("_SEG")?], + )], + ), + ], + ) + .append_aml_bytes(v) + } +} + +#[cfg(target_arch = "x86_64")] +struct PciDevSlotNotify { + device_id: u8, +} + +#[cfg(target_arch = "x86_64")] +impl Aml for PciDevSlotNotify { + fn append_aml_bytes(&self, v: &mut Vec) -> Result<(), aml::AmlError> { + let device_id_mask: u32 = 1 << self.device_id; + let object = aml::Path::new(&format!("S{:03}", self.device_id))?; + aml::And::new(&aml::Local(0), &aml::Arg(0), &device_id_mask).append_aml_bytes(v)?; + aml::If::new( + &aml::Equal::new(&aml::Local(0), &device_id_mask), + vec![&aml::Notify::new(&object, &aml::Arg(1))], + ) + .append_aml_bytes(v) + } +} + +#[cfg(target_arch = "x86_64")] +struct PciDevSlotMethods {} + +#[cfg(target_arch = "x86_64")] +impl Aml for PciDevSlotMethods { + fn append_aml_bytes(&self, v: &mut Vec) -> Result<(), aml::AmlError> { + let mut device_notifies = Vec::new(); + for device_id in 0..32 { + device_notifies.push(PciDevSlotNotify { device_id }); + } + + let mut device_notifies_refs: Vec<&dyn Aml> = Vec::new(); + for device_notify in device_notifies.iter() { + device_notifies_refs.push(device_notify); + } + + aml::Method::new("DVNT".try_into()?, 2, true, device_notifies_refs).append_aml_bytes(v)?; + aml::Method::new( + "PCNT".try_into()?, + 0, + true, + vec![ + &aml::Acquire::new("\\_SB_.PHPR.BLCK".try_into()?, 0xffff), + &aml::Store::new( + &aml::Path::new("\\_SB_.PHPR.PSEG")?, + &aml::Path::new("_SEG")?, + ), + &aml::MethodCall::new( + "DVNT".try_into()?, + vec![&aml::Path::new("\\_SB_.PHPR.PCIU")?, &aml::ONE], + ), + &aml::MethodCall::new( + "DVNT".try_into()?, + vec![&aml::Path::new("\\_SB_.PHPR.PCID")?, &3usize], + ), + &aml::Release::new("\\_SB_.PHPR.BLCK".try_into()?), + ], + ) + .append_aml_bytes(v) + } +} + +#[cfg(target_arch = "x86_64")] +struct PciDsmMethod {} + +#[cfg(target_arch = "x86_64")] +impl Aml for PciDsmMethod { + fn append_aml_bytes(&self, v: &mut Vec) -> Result<(), aml::AmlError> { + // Refer to ACPI spec v6.3 Ch 9.1.1 and PCI Firmware spec v3.3 Ch 4.6.1 + // _DSM (Device Specific Method), the following is the implementation in ASL. 
+ + // Method (_DSM, 4, NotSerialized) // _DSM: Device-Specific Method + // { + // If ((Arg0 == ToUUID ("e5c937d0-3553-4d7a-9117-ea4d19c3434d") /* Device Labeling + // Interface */)) { + // If ((Arg2 == Zero)) + // { + // Return (Buffer (One) { 0x21 }) + // } + // If ((Arg2 == 0x05)) + // { + // Return (Zero) + // } + // } + // + // Return (Buffer (One) { 0x00 }) + // } + // + // As per ACPI v6.3 Ch 19.6.142, the UUID is required to be in mixed endian: + // Among the fields of a UUID: + // {d1 (8 digits)} - {d2 (4 digits)} - {d3 (4 digits)} - {d4 (16 digits)} + // d1 ~ d3 need to be little endian, d4 be big endian. + // See https://en.wikipedia.org/wiki/Universally_unique_identifier#Encoding . + let uuid = Uuid::parse_str("E5C937D0-3553-4D7A-9117-EA4D19C3434D").unwrap(); + let (uuid_d1, uuid_d2, uuid_d3, uuid_d4) = uuid.as_fields(); + let mut uuid_buf = vec![]; + uuid_buf.extend(uuid_d1.to_le_bytes()); + uuid_buf.extend(uuid_d2.to_le_bytes()); + uuid_buf.extend(uuid_d3.to_le_bytes()); + uuid_buf.extend(uuid_d4); + aml::Method::new( + "_DSM".try_into()?, + 4, + false, + vec![ + &aml::If::new( + &aml::Equal::new(&aml::Arg(0), &aml::Buffer::new(uuid_buf)), + vec![ + &aml::If::new( + &aml::Equal::new(&aml::Arg(2), &aml::ZERO), + vec![&aml::Return::new(&aml::Buffer::new(vec![0x21]))], + ), + &aml::If::new( + &aml::Equal::new(&aml::Arg(2), &0x05u8), + vec![&aml::Return::new(&aml::ZERO)], + ), + ], + ), + &aml::Return::new(&aml::Buffer::new(vec![0])), + ], + ) + .append_aml_bytes(v) + } +} + +#[cfg(target_arch = "x86_64")] +impl Aml for PciSegment { + fn append_aml_bytes(&self, v: &mut Vec) -> Result<(), aml::AmlError> { + let mut pci_dsdt_inner_data: Vec<&dyn Aml> = Vec::new(); + let hid = aml::Name::new("_HID".try_into()?, &aml::EisaName::new("PNP0A08")?)?; + pci_dsdt_inner_data.push(&hid); + let cid = aml::Name::new("_CID".try_into()?, &aml::EisaName::new("PNP0A03")?)?; + pci_dsdt_inner_data.push(&cid); + let adr = aml::Name::new("_ADR".try_into()?, &aml::ZERO)?; + pci_dsdt_inner_data.push(&adr); + let seg = aml::Name::new("_SEG".try_into()?, &self.id)?; + pci_dsdt_inner_data.push(&seg); + let uid = aml::Name::new("_UID".try_into()?, &aml::ZERO)?; + pci_dsdt_inner_data.push(&uid); + let cca = aml::Name::new("_CCA".try_into()?, &aml::ONE)?; + pci_dsdt_inner_data.push(&cca); + let supp = aml::Name::new("SUPP".try_into()?, &aml::ZERO)?; + pci_dsdt_inner_data.push(&supp); + + let proximity_domain = self.proximity_domain; + let pxm_return = aml::Return::new(&proximity_domain); + let pxm = aml::Method::new("_PXM".try_into()?, 0, false, vec![&pxm_return]); + pci_dsdt_inner_data.push(&pxm); + + let pci_dsm = PciDsmMethod {}; + pci_dsdt_inner_data.push(&pci_dsm); + + #[allow(clippy::if_same_then_else)] + let crs = if self.id == 0 { + aml::Name::new( + "_CRS".try_into()?, + &aml::ResourceTemplate::new(vec![ + &aml::AddressSpace::new_bus_number(0x0u16, 0x0u16)?, + &aml::Io::new(0xcf8, 0xcf8, 1, 0x8), + &aml::Memory32Fixed::new( + true, + self.mmio_config_address.try_into().unwrap(), + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT.try_into().unwrap(), + ), + &aml::AddressSpace::new_memory( + aml::AddressSpaceCacheable::NotCacheable, + true, + self.start_of_mem32_area, + self.end_of_mem32_area, + )?, + &aml::AddressSpace::new_memory( + aml::AddressSpaceCacheable::NotCacheable, + true, + self.start_of_mem64_area, + self.end_of_mem64_area, + )?, + &aml::AddressSpace::new_io(0u16, 0x0cf7u16)?, + &aml::AddressSpace::new_io(0x0d00u16, 0xffffu16)?, + ]), + )? 
+ } else { + aml::Name::new( + "_CRS".try_into()?, + &aml::ResourceTemplate::new(vec![ + &aml::AddressSpace::new_bus_number(0x0u16, 0x0u16)?, + &aml::Memory32Fixed::new( + true, + self.mmio_config_address.try_into().unwrap(), + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT.try_into().unwrap(), + ), + &aml::AddressSpace::new_memory( + aml::AddressSpaceCacheable::NotCacheable, + true, + self.start_of_mem32_area, + self.end_of_mem32_area, + )?, + &aml::AddressSpace::new_memory( + aml::AddressSpaceCacheable::NotCacheable, + true, + self.start_of_mem64_area, + self.end_of_mem64_area, + )?, + ]), + )? + }; + pci_dsdt_inner_data.push(&crs); + + let mut pci_devices = Vec::new(); + for device_id in 0..32 { + let pci_device = PciDevSlot { device_id }; + pci_devices.push(pci_device); + } + for pci_device in pci_devices.iter() { + pci_dsdt_inner_data.push(pci_device); + } + + let pci_device_methods = PciDevSlotMethods {}; + pci_dsdt_inner_data.push(&pci_device_methods); + + // Build PCI routing table, listing IRQs assigned to PCI devices. + let prt_package_list: Vec<(u32, u32)> = self + .pci_irq_slots + .iter() + .enumerate() + .map(|(i, irq)| { + ( + ((((u32::try_from(i).unwrap()) & 0x1fu32) << 16) | 0xffffu32), + *irq as u32, + ) + }) + .collect(); + let prt_package_list: Vec = prt_package_list + .iter() + .map(|(bdf, irq)| aml::Package::new(vec![bdf, &0u8, &0u8, irq])) + .collect(); + let prt_package_list: Vec<&dyn Aml> = prt_package_list + .iter() + .map(|item| item as &dyn Aml) + .collect(); + let prt = aml::Name::new("_PRT".try_into()?, &aml::Package::new(prt_package_list))?; + pci_dsdt_inner_data.push(&prt); + + aml::Device::new( + format!("_SB_.PC{:02X}", self.id).as_str().try_into()?, + pci_dsdt_inner_data, + ) + .append_aml_bytes(v) + } +} From 9bb2d5e5d4c5e9f09f08459b1c11f7ce572565ee Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 7 May 2025 16:40:54 +0200 Subject: [PATCH 18/99] pci: add support for ACPI MCFG table So that we can declare which memory region is used by PCIe devices for MMCONFIG. Signed-off-by: Babis Chalios --- src/acpi-tables/src/lib.rs | 6 ++- src/acpi-tables/src/mcfg.rs | 77 +++++++++++++++++++++++++++++++++++++ src/vmm/src/acpi/mod.rs | 27 +++++++++++-- 3 files changed, 105 insertions(+), 5 deletions(-) create mode 100644 src/acpi-tables/src/mcfg.rs diff --git a/src/acpi-tables/src/lib.rs b/src/acpi-tables/src/lib.rs index 321328047ed..d3b7df0791e 100644 --- a/src/acpi-tables/src/lib.rs +++ b/src/acpi-tables/src/lib.rs @@ -10,6 +10,7 @@ pub mod aml; pub mod dsdt; pub mod fadt; pub mod madt; +pub mod mcfg; pub mod rsdp; pub mod xsdt; @@ -17,6 +18,7 @@ pub use aml::Aml; pub use dsdt::Dsdt; pub use fadt::Fadt; pub use madt::Madt; +pub use mcfg::Mcfg; pub use rsdp::Rsdp; pub use xsdt::Xsdt; use zerocopy::little_endian::{U32, U64}; @@ -89,7 +91,7 @@ pub struct SdtHeader { pub oem_table_id: [u8; 8], pub oem_revision: U32, pub creator_id: [u8; 4], - pub creator_revison: U32, + pub creator_revision: U32, } impl SdtHeader { @@ -110,7 +112,7 @@ impl SdtHeader { oem_table_id, oem_revision: U32::new(oem_revision), creator_id: FC_ACPI_CREATOR_ID, - creator_revison: U32::new(FC_ACPI_CREATOR_REVISION), + creator_revision: U32::new(FC_ACPI_CREATOR_REVISION), } } } diff --git a/src/acpi-tables/src/mcfg.rs b/src/acpi-tables/src/mcfg.rs new file mode 100644 index 00000000000..a5dd8b9d227 --- /dev/null +++ b/src/acpi-tables/src/mcfg.rs @@ -0,0 +1,77 @@ +// Copyright © 2019 Intel Corporation +// Copyright © 2023 Rivos, Inc. +// Copyright 2024 Amazon.com, Inc. or its affiliates. 
All Rights Reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +use std::mem::size_of; + +use vm_memory::{Bytes, GuestAddress, GuestMemory}; +use zerocopy::{Immutable, IntoBytes}; + +use crate::{Result, Sdt, SdtHeader, checksum}; + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Default, Debug, IntoBytes, Clone, Copy, Immutable)] +struct PciRangeEntry { + pub base_address: u64, + pub segment: u16, + pub start: u8, + pub end: u8, + _reserved: u32, +} + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Clone, Copy, Debug, Default, IntoBytes, Immutable)] +pub struct Mcfg { + header: SdtHeader, + _reserved: u64, + pci_range_entry: PciRangeEntry, +} + +impl Mcfg { + pub fn new( + oem_id: [u8; 6], + oem_table_id: [u8; 8], + oem_revision: u32, + pci_mmio_config_addr: u64, + ) -> Self { + let header = SdtHeader::new( + *b"MCFG", + size_of::().try_into().unwrap(), + 1, + oem_id, + oem_table_id, + oem_revision, + ); + + let mut mcfg = Mcfg { + header, + pci_range_entry: PciRangeEntry { + base_address: pci_mmio_config_addr, + segment: 0, + start: 0, + end: 0, + ..Default::default() + }, + ..Default::default() + }; + + mcfg.header.checksum = checksum(&[mcfg.as_bytes()]); + + mcfg + } +} + +impl Sdt for Mcfg { + fn len(&self) -> usize { + self.as_bytes().len() + } + + fn write_to_guest(&mut self, mem: &M, address: GuestAddress) -> Result<()> { + mem.write_slice(self.as_bytes(), address)?; + Ok(()) + } +} diff --git a/src/vmm/src/acpi/mod.rs b/src/vmm/src/acpi/mod.rs index 65075781188..a3e471aed9e 100644 --- a/src/vmm/src/acpi/mod.rs +++ b/src/vmm/src/acpi/mod.rs @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use acpi_tables::fadt::{FADT_F_HW_REDUCED_ACPI, FADT_F_PWR_BUTTON, FADT_F_SLP_BUTTON}; -use acpi_tables::{Aml, Dsdt, Fadt, Madt, Rsdp, Sdt, Xsdt, aml}; +use acpi_tables::{Aml, Dsdt, Fadt, Madt, Mcfg, Rsdp, Sdt, Xsdt, aml}; use log::{debug, error}; use vm_allocator::AllocPolicy; @@ -10,6 +10,7 @@ use crate::Vcpu; use crate::acpi::x86_64::{ apic_addr, rsdp_addr, setup_arch_dsdt, setup_arch_fadt, setup_interrupt_controllers, }; +use crate::arch::x86_64::layout; use crate::device_manager::DeviceManager; use crate::device_manager::resources::ResourceAllocator; use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; @@ -145,16 +146,27 @@ impl AcpiTableWriter<'_> { resource_allocator: &ResourceAllocator, fadt_addr: u64, madt_addr: u64, + mcfg_addr: u64, ) -> Result { let mut xsdt = Xsdt::new( OEM_ID, *b"FCMVXSDT", OEM_REVISION, - vec![fadt_addr, madt_addr], + vec![fadt_addr, madt_addr, mcfg_addr], ); self.write_acpi_table(resource_allocator, &mut xsdt) } + /// Build the MCFG table for the guest. + fn build_mcfg( + &mut self, + resource_allocator: &ResourceAllocator, + pci_mmio_config_addr: u64, + ) -> Result { + let mut mcfg = Mcfg::new(OEM_ID, *b"FCMVMCFG", OEM_REVISION, pci_mmio_config_addr); + self.write_acpi_table(resource_allocator, &mut mcfg) + } + /// Build the RSDP pointer for the guest. 
/// /// This will build the RSDP pointer which points to the XSDT table and write it in guest @@ -191,7 +203,16 @@ pub(crate) fn create_acpi_tables( &device_manager.resource_allocator, vcpus.len().try_into().unwrap(), )?; - let xsdt_addr = writer.build_xsdt(&device_manager.resource_allocator, fadt_addr, madt_addr)?; + let mcfg_addr = writer.build_mcfg( + &device_manager.resource_allocator, + layout::PCI_MMCONFIG_START, + )?; + let xsdt_addr = writer.build_xsdt( + &device_manager.resource_allocator, + fadt_addr, + madt_addr, + mcfg_addr, + )?; writer.build_rsdp(xsdt_addr) } From dc1202fee0720d67234fe10f9707a3242de83c5b Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 14 May 2025 17:35:49 +0200 Subject: [PATCH 19/99] pci: define PCI segment in FDT Write the PCI root bridge in FDT when PCI is enabled. Signed-off-by: Babis Chalios --- src/vmm/src/arch/aarch64/fdt.rs | 63 +++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index e2a42dd2982..c642373a016 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -13,8 +13,13 @@ use vm_memory::GuestMemoryError; use super::cache_info::{CacheEntry, read_cache_config}; use super::gic::GICDevice; +use crate::arch::{ + MEM_32BIT_DEVICES_SIZE, MEM_32BIT_DEVICES_START, MEM_64BIT_DEVICES_SIZE, + MEM_64BIT_DEVICES_START, PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, +}; use crate::device_manager::DeviceManager; use crate::device_manager::mmio::MMIODeviceInfo; +use crate::device_manager::pci_mngr::PciDevices; use crate::devices::acpi::vmgenid::{VMGENID_MEM_SIZE, VmGenId}; use crate::initrd::InitrdConfig; use crate::vstate::memory::{Address, GuestMemory, GuestMemoryMmap}; @@ -90,6 +95,7 @@ pub fn create_fdt( create_psci_node(&mut fdt_writer)?; create_devices_node(&mut fdt_writer, device_manager)?; create_vmgenid_node(&mut fdt_writer, &device_manager.acpi_devices.vmgenid)?; + create_pci_nodes(&mut fdt_writer, &device_manager.pci_devices)?; // End Header node. fdt_writer.end_node(root)?; @@ -431,6 +437,63 @@ fn create_devices_node( Ok(()) } +fn create_pci_nodes(fdt: &mut FdtWriter, pci_devices: &PciDevices) -> Result<(), FdtError> { + if pci_devices.pci_segment.is_none() { + return Ok(()); + } + + // Fine to unwrap here, we just checked it's not `None`. + let segment = pci_devices.pci_segment.as_ref().unwrap(); + + let pci_node_name = format!("pci@{:x}", segment.mmio_config_address); + // Each range here is a thruple of `(PCI address, CPU address, PCI size)`. 
+ // + // More info about the format can be found here: + // https://elinux.org/Device_Tree_Usage#PCI_Address_Translation + let ranges = [ + // 32bit addresses + 0x200_0000u32, + (MEM_32BIT_DEVICES_START >> 32) as u32, // PCI address + (MEM_32BIT_DEVICES_START & 0xffff_ffff) as u32, + (MEM_32BIT_DEVICES_START >> 32) as u32, // CPU address + (MEM_32BIT_DEVICES_START & 0xffff_ffff) as u32, + (MEM_32BIT_DEVICES_SIZE >> 32) as u32, // Range size + (MEM_32BIT_DEVICES_SIZE & 0xffff_ffff) as u32, + // 64bit addresses + 0x300_0000u32, + // PCI address + (MEM_64BIT_DEVICES_START >> 32) as u32, // PCI address + (MEM_64BIT_DEVICES_START & 0xffff_ffff) as u32, + // CPU address + (MEM_64BIT_DEVICES_START >> 32) as u32, // CPU address + (MEM_64BIT_DEVICES_START & 0xffff_ffff) as u32, + // Range size + (MEM_64BIT_DEVICES_SIZE >> 32) as u32, // Range size + ((MEM_64BIT_DEVICES_SIZE & 0xffff_ffff) >> 32) as u32, + ]; + let pci_node = fdt.begin_node(&pci_node_name)?; + + fdt.property_string("compatible", "pci-host-ecam-generic")?; + fdt.property_string("device_type", "pci")?; + fdt.property_array_u32("ranges", &ranges)?; + fdt.property_array_u32("bus-range", &[0, 0])?; + fdt.property_u32("linux,pci-domain", segment.id.into())?; + fdt.property_u32("#address-cells", 3)?; + fdt.property_u32("#size-cells", 2)?; + fdt.property_array_u64( + "reg", + &[ + segment.mmio_config_address, + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, + ], + )?; + fdt.property_u32("#interrupt-cells", 1)?; + fdt.property_null("interrupt-map")?; + fdt.property_null("interrupt-map-mask")?; + fdt.property_null("dma-coherent")?; + Ok(fdt.end_node(pci_node)?) +} + #[cfg(test)] mod tests { use std::ffi::CString; From 7340a6612648fbc2679a67bdf5dc3bb5776d567a Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 13 May 2025 10:42:04 +0200 Subject: [PATCH 20/99] pci: make PCIe support optional Add a command line argument to enable PCIe support. By default, PCIe is disabled. The reason for making PCIe off by default is that users need to explicitly enable PCI support in their kernels. Requiring users to explicitly enable it, does not break existing deployments, i.e. users can upgrade Firecracker within their existing environments without breaking any deployment. 
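For illustration, with this change a PCI-enabled microVM is started by passing the new flag on the Firecracker command line (invocation shown as an example only; other arguments elided):

    firecracker --api-sock /tmp/firecracker.socket --enable-pci

When the flag is not given, the builder appends `pci=off` to the guest kernel command line, preserving the previous behaviour of not probing for a PCI bus (and the boot-time savings that come with it).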
Signed-off-by: Babis Chalios --- src/firecracker/src/api_server_adapter.rs | 3 +++ src/firecracker/src/main.rs | 13 +++++++++++++ src/vmm/src/builder.rs | 6 +++++- src/vmm/src/resources.rs | 3 +++ src/vmm/src/rpc_interface.rs | 16 +++++++--------- src/vmm/src/vmm_config/boot_source.rs | 3 +-- 6 files changed, 32 insertions(+), 12 deletions(-) diff --git a/src/firecracker/src/api_server_adapter.rs b/src/firecracker/src/api_server_adapter.rs index 173ef298265..f597a5f7db9 100644 --- a/src/firecracker/src/api_server_adapter.rs +++ b/src/firecracker/src/api_server_adapter.rs @@ -143,6 +143,7 @@ pub(crate) fn run_with_api( instance_info: InstanceInfo, process_time_reporter: ProcessTimeReporter, boot_timer_enabled: bool, + pci_enabled: bool, api_payload_limit: usize, mmds_size_limit: usize, metadata_json: Option<&str>, @@ -212,6 +213,7 @@ pub(crate) fn run_with_api( json, instance_info, boot_timer_enabled, + pci_enabled, mmds_size_limit, metadata_json, ) @@ -224,6 +226,7 @@ pub(crate) fn run_with_api( &to_api, &api_event_fd, boot_timer_enabled, + pci_enabled, mmds_size_limit, metadata_json, ) diff --git a/src/firecracker/src/main.rs b/src/firecracker/src/main.rs index 4d6536d054c..3e6ad35d6a9 100644 --- a/src/firecracker/src/main.rs +++ b/src/firecracker/src/main.rs @@ -260,6 +260,11 @@ fn main_exec() -> Result<(), MainError> { Argument::new("mmds-size-limit") .takes_value(true) .help("Mmds data store limit, in bytes."), + ) + .arg( + Argument::new("enable-pci") + .takes_value(false) + .help("Enables PCIe support."), ); arg_parser.parse_from_cmdline()?; @@ -369,6 +374,7 @@ fn main_exec() -> Result<(), MainError> { .map(|x| x.expect("Unable to open or read from the mmds content file")); let boot_timer_enabled = arguments.flag_present("boot-timer"); + let pci_enabled = arguments.flag_present("enable-pci"); let api_enabled = !arguments.flag_present("no-api"); let api_payload_limit = arg_parser .arguments() @@ -422,6 +428,7 @@ fn main_exec() -> Result<(), MainError> { instance_info, process_time_reporter, boot_timer_enabled, + pci_enabled, api_payload_limit, mmds_size_limit, metadata_json.as_deref(), @@ -437,6 +444,7 @@ fn main_exec() -> Result<(), MainError> { vmm_config_json, instance_info, boot_timer_enabled, + pci_enabled, mmds_size_limit, metadata_json.as_deref(), ) @@ -554,12 +562,14 @@ pub enum BuildFromJsonError { } // Configure and start a microVM as described by the command-line JSON. 
+#[allow(clippy::too_many_arguments)] fn build_microvm_from_json( seccomp_filters: &BpfThreadMap, event_manager: &mut EventManager, config_json: String, instance_info: InstanceInfo, boot_timer_enabled: bool, + pci_enabled: bool, mmds_size_limit: usize, metadata_json: Option<&str>, ) -> Result<(VmResources, Arc>), BuildFromJsonError> { @@ -567,6 +577,7 @@ fn build_microvm_from_json( VmResources::from_json(&config_json, &instance_info, mmds_size_limit, metadata_json) .map_err(BuildFromJsonError::ParseFromJson)?; vm_resources.boot_timer = boot_timer_enabled; + vm_resources.pci_enabled = pci_enabled; let vmm = vmm::builder::build_and_boot_microvm( &instance_info, &vm_resources, @@ -593,6 +604,7 @@ fn run_without_api( config_json: Option, instance_info: InstanceInfo, bool_timer_enabled: bool, + pci_enabled: bool, mmds_size_limit: usize, metadata_json: Option<&str>, ) -> Result<(), RunWithoutApiError> { @@ -610,6 +622,7 @@ fn run_without_api( config_json.unwrap(), instance_info, bool_timer_enabled, + pci_enabled, mmds_size_limit, metadata_json, ) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 7b30ef74189..be590ded918 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -217,7 +217,11 @@ pub fn build_microvm_for_boot( .map(|vcpu| vcpu.copy_kvm_vcpu_fd(vmm.vm())) .collect::, _>>()?; - vmm.device_manager.enable_pci()?; + if vm_resources.pci_enabled { + vmm.device_manager.enable_pci()?; + } else { + boot_cmdline.insert("pci", "off")?; + } // The boot timer device needs to be the first device attached in order // to maintain the same MMIO address referenced in the documentation diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index 1e2bd803e1d..d29f76740fc 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -114,6 +114,8 @@ pub struct VmResources { pub mmds_size_limit: usize, /// Whether or not to load boot timer device. pub boot_timer: bool, + /// Whether or not to use PCIe transport for VirtIO devices. + pub pci_enabled: bool, } impl VmResources { @@ -614,6 +616,7 @@ mod tests { boot_timer: false, mmds_size_limit: HTTP_MAX_PAYLOAD_SIZE, entropy: Default::default(), + pci_enabled: false, } } diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index e015152470e..d26b1ba877d 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -331,18 +331,16 @@ impl<'a> PrebootApiController<'a> { to_api: &std::sync::mpsc::Sender, api_event_fd: &vmm_sys_util::eventfd::EventFd, boot_timer_enabled: bool, + pci_enabled: bool, mmds_size_limit: usize, metadata_json: Option<&str>, ) -> Result<(VmResources, Arc>), BuildMicrovmFromRequestsError> { - let mut vm_resources = VmResources::default(); - // Silence false clippy warning. Clippy suggests using - // VmResources { boot_timer: boot_timer_enabled, ..Default::default() }; but this will - // generate build errors because VmResources contains private fields. - #[allow(clippy::field_reassign_with_default)] - { - vm_resources.mmds_size_limit = mmds_size_limit; - vm_resources.boot_timer = boot_timer_enabled; - } + let mut vm_resources = VmResources { + boot_timer: boot_timer_enabled, + mmds_size_limit, + pci_enabled, + ..Default::default() + }; // Init the data store from file, if present. 
if let Some(data) = metadata_json { diff --git a/src/vmm/src/vmm_config/boot_source.rs b/src/vmm/src/vmm_config/boot_source.rs index 37ba08be449..297f8abff04 100644 --- a/src/vmm/src/vmm_config/boot_source.rs +++ b/src/vmm/src/vmm_config/boot_source.rs @@ -9,14 +9,13 @@ use serde::{Deserialize, Serialize}; /// Default guest kernel command line: /// - `reboot=k` shut down the guest on reboot, instead of well... rebooting; /// - `panic=1` on panic, reboot after 1 second; -/// - `pci=off` do not scan for PCI devices (save boot time); /// - `nomodule` disable loadable kernel module support; /// - `8250.nr_uarts=0` disable 8250 serial interface; /// - `i8042.noaux` do not probe the i8042 controller for an attached mouse (save boot time); /// - `i8042.nomux` do not probe i8042 for a multiplexing controller (save boot time); /// - `i8042.dumbkbd` do not attempt to control kbd state via the i8042 (save boot time). pub const DEFAULT_KERNEL_CMDLINE: &str = - "reboot=k panic=1 pci=off nomodule 8250.nr_uarts=0 i8042.noaux i8042.nomux i8042.dumbkbd"; + "reboot=k panic=1 nomodule 8250.nr_uarts=0 i8042.noaux i8042.nomux i8042.dumbkbd"; /// Strongly typed data structure used to configure the boot source of the /// microvm. From 2c2c0110e29461b0cefc5844ac746d0b9736a808 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 13 May 2025 12:37:56 +0200 Subject: [PATCH 21/99] pci: add support for snapshotting PCI devices At the moment, the logic just restores the device manager and add the PCIe root complex if PCI is enabled. Signed-off-by: Babis Chalios --- src/vmm/src/device_manager/mod.rs | 9 +++++++++ src/vmm/src/device_manager/pci_mngr.rs | 24 ++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 5c01a195fc5..2922060bb13 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -265,6 +265,8 @@ pub struct DevicesState { pub mmio_state: persist::DeviceStates, /// ACPI devices state pub acpi_state: persist::ACPIDeviceManagerState, + /// PCI devices state + pub pci_state: pci_mngr::PciDevicesState, } #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -273,6 +275,8 @@ pub enum DevicePersistError { MmioRestore(#[from] persist::DevicePersistError), /// Error restoring ACPI devices: {0} AcpiRestore(#[from] persist::ACPIDeviceManagerRestoreError), + /// Error restoring PCI devices: {0} + PciRestore(#[from] PciManagerError), /// Error notifying VMGenID device: {0} VmGenidUpdate(#[from] std::io::Error), /// Error resetting serial console: {0} @@ -295,6 +299,7 @@ impl DeviceManager { DevicesState { mmio_state: self.mmio_devices.save(), acpi_state: self.acpi_devices.save(), + pci_state: self.pci_devices.save(), } } @@ -366,6 +371,10 @@ impl DeviceManager { self.acpi_devices = ACPIDeviceManager::restore(acpi_ctor_args, &state.acpi_state)?; self.acpi_devices.notify_vmgenid()?; + // Restore PCI devices + self.pci_devices + .restore(&state.pci_state, &self.resource_allocator)?; + Ok(()) } } diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index c3bf2ada977..e9ada60cc1f 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -3,6 +3,7 @@ use std::sync::Arc; +use serde::{Deserialize, Serialize}; use vm_device::BusError; use super::resources::ResourceAllocator; @@ -42,4 +43,27 @@ impl PciDevices { Ok(()) } + + pub fn save(&self) -> PciDevicesState { + PciDevicesState { + pci_enabled: 
self.pci_segment.is_some(), + } + } + + pub fn restore( + &mut self, + state: &PciDevicesState, + resource_allocator: &Arc, + ) -> Result<(), PciManagerError> { + if state.pci_enabled { + self.attach_pci_segment(resource_allocator)?; + } + + Ok(()) + } +} + +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct PciDevicesState { + pci_enabled: bool, } From d3572314a0885745bbfeafa345dfa166b4a5cf43 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 12 May 2025 10:46:16 +0200 Subject: [PATCH 22/99] pci: add tests for PCIe root bus Add an integration test that checks that `lspci` correctly locates the PCIe root complex if PCI is enabled for the microVM. Also, add a negative test that checks that PCIe root complex doesn't exist when PCI is not enabled. Also, extend coverage of, at least some of, the tests to ensure that they run with and without PCI configuration enabled. Do that by extending the `uvm_any*` fixtures to yield both variants. Signed-off-by: Babis Chalios --- tests/conftest.py | 79 +++++++++++++++++-- tests/framework/microvm.py | 4 + .../integration_tests/functional/test_net.py | 4 +- .../integration_tests/functional/test_pci.py | 28 +++++++ .../integration_tests/functional/test_rng.py | 18 +++-- .../security/test_vulnerabilities.py | 8 +- 6 files changed, 126 insertions(+), 15 deletions(-) create mode 100644 tests/integration_tests/functional/test_pci.py diff --git a/tests/conftest.py b/tests/conftest.py index 4482a685155..bf4cb5c3649 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -569,12 +569,24 @@ def mem_size_mib(): return 256 +@pytest.fixture(params=[True, False]) +def pci_enabled(request): + """Fixture that allows configuring whether a microVM will have PCI enabled or not""" + yield request.param + + def uvm_booted( - microvm_factory, guest_kernel, rootfs, cpu_template, vcpu_count=2, mem_size_mib=256 + microvm_factory, + guest_kernel, + rootfs, + cpu_template, + pci_enabled, + vcpu_count=2, + mem_size_mib=256, ): """Return a booted uvm""" uvm = microvm_factory.build(guest_kernel, rootfs) - uvm.spawn() + uvm.spawn(pci=pci_enabled) uvm.basic_config(vcpu_count=vcpu_count, mem_size_mib=mem_size_mib) uvm.set_cpu_template(cpu_template) uvm.add_net_iface() @@ -582,9 +594,13 @@ def uvm_booted( return uvm -def uvm_restored(microvm_factory, guest_kernel, rootfs, cpu_template, **kwargs): +def uvm_restored( + microvm_factory, guest_kernel, rootfs, cpu_template, pci_enabled, **kwargs +): """Return a restored uvm""" - uvm = uvm_booted(microvm_factory, guest_kernel, rootfs, cpu_template, **kwargs) + uvm = uvm_booted( + microvm_factory, guest_kernel, rootfs, cpu_template, pci_enabled, **kwargs + ) snapshot = uvm.snapshot_full() uvm.kill() uvm2 = microvm_factory.build_from_snapshot(snapshot) @@ -605,6 +621,7 @@ def uvm_any( guest_kernel, rootfs, cpu_template_any, + pci_enabled, vcpu_count, mem_size_mib, ): @@ -614,6 +631,7 @@ def uvm_any( guest_kernel, rootfs, cpu_template_any, + pci_enabled, vcpu_count=vcpu_count, mem_size_mib=mem_size_mib, ) @@ -621,7 +639,13 @@ def uvm_any( @pytest.fixture def uvm_any_booted( - microvm_factory, guest_kernel, rootfs, cpu_template_any, vcpu_count, mem_size_mib + microvm_factory, + guest_kernel, + rootfs, + cpu_template_any, + pci_enabled, + vcpu_count, + mem_size_mib, ): """Return booted uvms""" return uvm_booted( @@ -629,6 +653,51 @@ def uvm_any_booted( guest_kernel, rootfs, cpu_template_any, + pci_enabled, + vcpu_count=vcpu_count, + mem_size_mib=mem_size_mib, + ) + + +@pytest.fixture +def uvm_any_with_pci( + uvm_ctor, + 
microvm_factory, + guest_kernel_acpi, + rootfs, + cpu_template_any, + vcpu_count, + mem_size_mib, +): + """Return booted uvms with PCI enabled""" + return uvm_ctor( + microvm_factory, + guest_kernel_acpi, + rootfs, + cpu_template_any, + True, + vcpu_count=vcpu_count, + mem_size_mib=mem_size_mib, + ) + + +@pytest.fixture +def uvm_any_without_pci( + uvm_ctor, + microvm_factory, + guest_kernel, + rootfs, + cpu_template_any, + vcpu_count, + mem_size_mib, +): + """Return booted uvms with PCI disabled""" + return uvm_ctor( + microvm_factory, + guest_kernel, + rootfs, + cpu_template_any, + False, vcpu_count=vcpu_count, mem_size_mib=mem_size_mib, ) diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index 78f8d669600..7bed003a868 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -633,6 +633,7 @@ def spawn( log_show_origin=False, metrics_path="fc.ndjson", emit_metrics: bool = False, + pci: bool = False, ): """Start a microVM as a daemon or in a screen session.""" # pylint: disable=subprocess-run-check @@ -678,6 +679,9 @@ def spawn( # Checking the timings requires DEBUG level log messages self.time_api_requests = False + if pci: + self.jailer.extra_args["enable-pci"] = None + cmd = [ *self._pre_cmd, str(self.jailer_binary_path), diff --git a/tests/integration_tests/functional/test_net.py b/tests/integration_tests/functional/test_net.py index 7abf23406d5..10467affac8 100644 --- a/tests/integration_tests/functional/test_net.py +++ b/tests/integration_tests/functional/test_net.py @@ -85,9 +85,9 @@ def test_multi_queue_unsupported(uvm_plain): @pytest.fixture -def uvm_any(microvm_factory, uvm_ctor, guest_kernel, rootfs): +def uvm_any(microvm_factory, uvm_ctor, guest_kernel, rootfs, pci_enabled): """Return booted and restored uvm with no CPU templates""" - return uvm_ctor(microvm_factory, guest_kernel, rootfs, None) + return uvm_ctor(microvm_factory, guest_kernel, rootfs, None, pci_enabled) def test_tap_offload(uvm_any): diff --git a/tests/integration_tests/functional/test_pci.py b/tests/integration_tests/functional/test_pci.py new file mode 100644 index 00000000000..dc0827b1aae --- /dev/null +++ b/tests/integration_tests/functional/test_pci.py @@ -0,0 +1,28 @@ +# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Tests for the PCI devices""" + + +def test_pci_root_present(uvm_any_with_pci): + """ + Test that a guest with PCI enabled has a PCI root device. + """ + + vm = uvm_any_with_pci + devices = vm.ssh.run("lspci").stdout.strip().split("\n") + print(devices) + assert devices[0].startswith( + "00:00.0 Host bridge: Intel Corporation Device" + ), "PCI root not found in guest" + + +def test_pci_disabled(uvm_any_without_pci): + """ + Test that a guest with PCI disabled does not have a PCI root device but still works. 
+ """ + + vm = uvm_any_without_pci + _, stdout, _ = vm.ssh.run("lspci") + assert ( + "00:00.0 Host bridge: Intel Corporation Device" not in stdout + ), "PCI root not found in guest" diff --git a/tests/integration_tests/functional/test_rng.py b/tests/integration_tests/functional/test_rng.py index 1893230c51a..f2acf96735a 100644 --- a/tests/integration_tests/functional/test_rng.py +++ b/tests/integration_tests/functional/test_rng.py @@ -8,10 +8,12 @@ from host_tools.network import SSHConnection -def uvm_with_rng_booted(microvm_factory, guest_kernel, rootfs, rate_limiter): +def uvm_with_rng_booted( + microvm_factory, guest_kernel, rootfs, rate_limiter, pci_enabled +): """Return a booted microvm with virtio-rng configured""" uvm = microvm_factory.build(guest_kernel, rootfs) - uvm.spawn(log_level="INFO") + uvm.spawn(log_level="INFO", pci=pci_enabled) uvm.basic_config(vcpu_count=2, mem_size_mib=256) uvm.add_net_iface() uvm.api.entropy.put(rate_limiter=rate_limiter) @@ -21,9 +23,13 @@ def uvm_with_rng_booted(microvm_factory, guest_kernel, rootfs, rate_limiter): return uvm -def uvm_with_rng_restored(microvm_factory, guest_kernel, rootfs, rate_limiter): +def uvm_with_rng_restored( + microvm_factory, guest_kernel, rootfs, rate_limiter, pci_enabled +): """Return a restored uvm with virtio-rng configured""" - uvm = uvm_with_rng_booted(microvm_factory, guest_kernel, rootfs, rate_limiter) + uvm = uvm_with_rng_booted( + microvm_factory, guest_kernel, rootfs, rate_limiter, pci_enabled + ) snapshot = uvm.snapshot_full() uvm.kill() uvm2 = microvm_factory.build_from_snapshot(snapshot) @@ -44,9 +50,9 @@ def rate_limiter(request): @pytest.fixture -def uvm_any(microvm_factory, uvm_ctor, guest_kernel, rootfs, rate_limiter): +def uvm_any(microvm_factory, uvm_ctor, guest_kernel, rootfs, rate_limiter, pci_enabled): """Return booted and restored uvms""" - return uvm_ctor(microvm_factory, guest_kernel, rootfs, rate_limiter) + return uvm_ctor(microvm_factory, guest_kernel, rootfs, rate_limiter, pci_enabled) def list_rng_available(ssh_connection: SSHConnection) -> list[str]: diff --git a/tests/integration_tests/security/test_vulnerabilities.py b/tests/integration_tests/security/test_vulnerabilities.py index b15af03ab38..711db312d80 100644 --- a/tests/integration_tests/security/test_vulnerabilities.py +++ b/tests/integration_tests/security/test_vulnerabilities.py @@ -216,13 +216,17 @@ def microvm_factory_a(record_property): @pytest.fixture -def uvm_any_a(microvm_factory_a, uvm_ctor, guest_kernel, rootfs, cpu_template_any): +def uvm_any_a( + microvm_factory_a, uvm_ctor, guest_kernel, rootfs, cpu_template_any, pci_enabled +): """Return uvm with revision A firecracker Since pytest caches fixtures, this guarantees uvm_any_a will match a vm from uvm_any. See https://docs.pytest.org/en/stable/how-to/fixtures.html#fixtures-can-be-requested-more-than-once-per-test-return-values-are-cached """ - return uvm_ctor(microvm_factory_a, guest_kernel, rootfs, cpu_template_any) + return uvm_ctor( + microvm_factory_a, guest_kernel, rootfs, cpu_template_any, pci_enabled + ) def test_check_vulnerability_files_ab(request, uvm_any): From 3b30425c1f9c6e159cdfa5660a6d1d532db75024 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 21 May 2025 10:48:55 +0200 Subject: [PATCH 23/99] test: allow `extd_apicid` CPU feature on AMD guests PCI-enabled guest kernels enable the `extd_apicid` CPU feature for AMD CPU families after 16h. Our supported AMD families (Milan & Genoa) are both 19h. 
This is irrespective of whether PCI is enabled in Firecracker. Do not mark this as host-only when running with PCI enabled kernels, i.e. all kernels that support ACPI. Signed-off-by: Babis Chalios --- tests/framework/microvm.py | 2 ++ .../functional/test_cpu_features_host_vs_guest.py | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index 7bed003a868..956889210c2 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -213,6 +213,7 @@ def __init__( assert microvm_id is not None self._microvm_id = microvm_id + self.pci_enabled = False self.kernel_file = None self.rootfs_file = None self.ssh_key = None @@ -680,6 +681,7 @@ def spawn( self.time_api_requests = False if pci: + self.pci_enabled = True self.jailer.extra_args["enable-pci"] = None cmd = [ diff --git a/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py b/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py index 955a70bd38b..78ea0380f1b 100644 --- a/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py +++ b/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py @@ -91,7 +91,6 @@ "cqm_occup_llc", "decodeassists", "extapic", - "extd_apicid", "flushbyasid", "hw_pstate", "ibs", From 4ef00edeafba25ab4a2b15b7e5ea68d20235ccf4 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 21 May 2025 15:12:06 +0200 Subject: [PATCH 24/99] test: add Rust integration tests for PCI-enabled uVMs We have some Rust integration tests that check building and booting of microVMs works correctly. Add variants for PCI-enabled microVMs. Signed-off-by: Babis Chalios --- src/vmm/src/test_utils/mod.rs | 19 +++-- src/vmm/tests/integration_tests.rs | 111 +++++++++++++++++------------ 2 files changed, 82 insertions(+), 48 deletions(-) diff --git a/src/vmm/src/test_utils/mod.rs b/src/vmm/src/test_utils/mod.rs index fb936fe7659..2cfcc274b5d 100644 --- a/src/vmm/src/test_utils/mod.rs +++ b/src/vmm/src/test_utils/mod.rs @@ -69,6 +69,7 @@ pub fn create_vmm( _kernel_image: Option<&str>, is_diff: bool, boot_microvm: bool, + pci_enabled: bool, ) -> (Arc>, EventManager) { let mut event_manager = EventManager::new().unwrap(); let empty_seccomp_filters = get_empty_filters(); @@ -82,7 +83,7 @@ pub fn create_vmm( None => boot_source_cfg.into(), }; let mock_vm_res = MockVmResources::new().with_boot_source(boot_source_cfg); - let resources: VmResources = if is_diff { + let mut resources: VmResources = if is_diff { mock_vm_res .with_vm_config(MockVmConfig::new().with_dirty_page_tracking().into()) .into() @@ -90,6 +91,8 @@ pub fn create_vmm( mock_vm_res.into() }; + resources.pci_enabled = pci_enabled; + let vmm = build_microvm_for_boot( &InstanceInfo::default(), &resources, @@ -106,15 +109,23 @@ pub fn create_vmm( } pub fn default_vmm(kernel_image: Option<&str>) -> (Arc>, EventManager) { - create_vmm(kernel_image, false, true) + create_vmm(kernel_image, false, true, false) } pub fn default_vmm_no_boot(kernel_image: Option<&str>) -> (Arc>, EventManager) { - create_vmm(kernel_image, false, false) + create_vmm(kernel_image, false, false, false) +} + +pub fn default_vmm_pci_no_boot(kernel_image: Option<&str>) -> (Arc>, EventManager) { + create_vmm(kernel_image, false, false, true) } pub fn dirty_tracking_vmm(kernel_image: Option<&str>) -> (Arc>, EventManager) { - create_vmm(kernel_image, true, true) + create_vmm(kernel_image, true, true, false) +} + +pub fn default_vmm_pci(kernel_image: Option<&str>) -> (Arc>, EventManager) { + 
create_vmm(kernel_image, false, true, false) } #[allow(clippy::undocumented_unsafe_blocks)] diff --git a/src/vmm/tests/integration_tests.rs b/src/vmm/tests/integration_tests.rs index 02612743beb..4dd993d7c90 100644 --- a/src/vmm/tests/integration_tests.rs +++ b/src/vmm/tests/integration_tests.rs @@ -4,6 +4,7 @@ #![allow(clippy::cast_possible_truncation, clippy::tests_outside_test_module)] use std::io::{Seek, SeekFrom}; +use std::sync::{Arc, Mutex}; use std::thread; use std::time::Duration; @@ -17,7 +18,9 @@ use vmm::rpc_interface::{ use vmm::seccomp::get_empty_filters; use vmm::snapshot::Snapshot; use vmm::test_utils::mock_resources::{MockVmResources, NOISY_KERNEL_IMAGE}; -use vmm::test_utils::{create_vmm, default_vmm, default_vmm_no_boot}; +use vmm::test_utils::{ + create_vmm, default_vmm, default_vmm_no_boot, default_vmm_pci, default_vmm_pci_no_boot, +}; use vmm::vmm_config::balloon::BalloonDeviceConfig; use vmm::vmm_config::boot_source::BootSourceConfig; use vmm::vmm_config::drive::BlockDeviceConfig; @@ -28,9 +31,24 @@ use vmm::vmm_config::snapshot::{ CreateSnapshotParams, LoadSnapshotParams, MemBackendConfig, MemBackendType, SnapshotType, }; use vmm::vmm_config::vsock::VsockDeviceConfig; -use vmm::{DumpCpuConfigError, EventManager, FcExitCode}; +use vmm::{DumpCpuConfigError, EventManager, FcExitCode, Vmm}; use vmm_sys_util::tempfile::TempFile; +#[allow(unused_mut, unused_variables)] +fn check_booted_microvm(vmm: Arc>, mut evmgr: EventManager) { + // On x86_64, the vmm should exit once its workload completes and signals the exit event. + // On aarch64, the test kernel doesn't exit, so the vmm is force-stopped. + #[cfg(target_arch = "x86_64")] + evmgr.run_with_timeout(500).unwrap(); + #[cfg(target_arch = "aarch64")] + vmm.lock().unwrap().stop(FcExitCode::Ok); + + assert_eq!( + vmm.lock().unwrap().shutdown_exit_code(), + Some(FcExitCode::Ok) + ); +} + #[test] fn test_build_and_boot_microvm() { // Error case: no boot source configured. @@ -49,25 +67,17 @@ fn test_build_and_boot_microvm() { } // Success case. - let (vmm, mut _evmgr) = default_vmm(None); + let (vmm, evmgr) = default_vmm(None); + check_booted_microvm(vmm, evmgr); - // On x86_64, the vmm should exit once its workload completes and signals the exit event. - // On aarch64, the test kernel doesn't exit, so the vmm is force-stopped. - #[cfg(target_arch = "x86_64")] - _evmgr.run_with_timeout(500).unwrap(); - #[cfg(target_arch = "aarch64")] - vmm.lock().unwrap().stop(FcExitCode::Ok); - - assert_eq!( - vmm.lock().unwrap().shutdown_exit_code(), - Some(FcExitCode::Ok) - ); + // microVM with PCI + let (vmm, evmgr) = default_vmm_pci(None); + check_booted_microvm(vmm, evmgr); } -#[test] -fn test_build_microvm() { +#[allow(unused_mut, unused_variables)] +fn check_build_microvm(vmm: Arc>, mut evmgr: EventManager) { // The built microVM should be in the `VmState::Paused` state here. - let (vmm, mut _evtmgr) = default_vmm_no_boot(None); assert_eq!(vmm.lock().unwrap().instance_info().state, VmState::Paused); // The microVM should be able to resume and exit successfully. @@ -75,7 +85,7 @@ fn test_build_microvm() { // On aarch64, the test kernel doesn't exit, so the vmm is force-stopped. 
vmm.lock().unwrap().resume_vm().unwrap(); #[cfg(target_arch = "x86_64")] - _evtmgr.run_with_timeout(500).unwrap(); + evmgr.run_with_timeout(500).unwrap(); #[cfg(target_arch = "aarch64")] vmm.lock().unwrap().stop(FcExitCode::Ok); assert_eq!( @@ -85,10 +95,14 @@ fn test_build_microvm() { } #[test] -fn test_pause_resume_microvm() { - // Tests that pausing and resuming a microVM work as expected. - let (vmm, _) = default_vmm(None); +fn test_build_microvm() { + let (vmm, evtmgr) = default_vmm_no_boot(None); + check_build_microvm(vmm, evtmgr); + let (vmm, evtmgr) = default_vmm_pci_no_boot(None); + check_build_microvm(vmm, evtmgr); +} +fn pause_resume_microvm(vmm: Arc>) { let mut api_controller = RuntimeApiController::new(VmResources::default(), vmm.clone()); // There's a race between this thread and the vcpu thread, but this thread @@ -102,6 +116,17 @@ fn test_pause_resume_microvm() { vmm.lock().unwrap().stop(FcExitCode::Ok); } +#[test] +fn test_pause_resume_microvm() { + // Tests that pausing and resuming a microVM work as expected. + let (vmm, _) = default_vmm(None); + + pause_resume_microvm(vmm); + + let (vmm, _) = default_vmm_pci(None); + pause_resume_microvm(vmm); +} + #[test] #[cfg(target_arch = "x86_64")] fn test_dirty_bitmap_success() { @@ -170,11 +195,11 @@ fn test_disallow_dump_cpu_config_without_pausing() { vmm.lock().unwrap().stop(FcExitCode::Ok); } -fn verify_create_snapshot(is_diff: bool) -> (TempFile, TempFile) { +fn verify_create_snapshot(is_diff: bool, pci_enabled: bool) -> (TempFile, TempFile) { let snapshot_file = TempFile::new().unwrap(); let memory_file = TempFile::new().unwrap(); - let (vmm, _) = create_vmm(Some(NOISY_KERNEL_IMAGE), is_diff, true); + let (vmm, _) = create_vmm(Some(NOISY_KERNEL_IMAGE), is_diff, true, pci_enabled); let resources = VmResources { machine_config: MachineConfig { mem_size_mib: 1, @@ -281,29 +306,27 @@ fn verify_load_snapshot(snapshot_file: TempFile, memory_file: TempFile) { #[test] fn test_create_and_load_snapshot() { - // Create diff snapshot. - let (snapshot_file, memory_file) = verify_create_snapshot(true); - // Create a new microVm from snapshot. This only tests code-level logic; it verifies - // that a microVM can be built with no errors from given snapshot. - // It does _not_ verify that the guest is actually restored properly. We're using - // python integration tests for that. - verify_load_snapshot(snapshot_file, memory_file); - - // Create full snapshot. - let (snapshot_file, memory_file) = verify_create_snapshot(false); - // Create a new microVm from snapshot. This only tests code-level logic; it verifies - // that a microVM can be built with no errors from given snapshot. - // It does _not_ verify that the guest is actually restored properly. We're using - // python integration tests for that. - verify_load_snapshot(snapshot_file, memory_file); + for (diff_snap, pci_enabled) in [(false, false), (false, true), (true, false), (true, true)] { + // Create snapshot. + let (snapshot_file, memory_file) = verify_create_snapshot(diff_snap, pci_enabled); + // Create a new microVm from snapshot. This only tests code-level logic; it verifies + // that a microVM can be built with no errors from given snapshot. + // It does _not_ verify that the guest is actually restored properly. We're using + // python integration tests for that. 
+ verify_load_snapshot(snapshot_file, memory_file); + } } #[test] fn test_snapshot_load_sanity_checks() { - use vmm::persist::SnapShotStateSanityCheckError; - - let mut microvm_state = get_microvm_state_from_snapshot(); + let microvm_state = get_microvm_state_from_snapshot(false); + check_snapshot(microvm_state); + let microvm_state = get_microvm_state_from_snapshot(true); + check_snapshot(microvm_state); +} +fn check_snapshot(mut microvm_state: MicrovmState) { + use vmm::persist::SnapShotStateSanityCheckError; snapshot_state_sanity_check(µvm_state).unwrap(); // Remove memory regions. @@ -316,9 +339,9 @@ fn test_snapshot_load_sanity_checks() { ); } -fn get_microvm_state_from_snapshot() -> MicrovmState { +fn get_microvm_state_from_snapshot(pci_enabled: bool) -> MicrovmState { // Create a diff snapshot - let (snapshot_file, _) = verify_create_snapshot(true); + let (snapshot_file, _) = verify_create_snapshot(true, pci_enabled); // Deserialize the microVM state. let snapshot_file_metadata = snapshot_file.as_file().metadata().unwrap(); @@ -329,7 +352,7 @@ fn get_microvm_state_from_snapshot() -> MicrovmState { } fn verify_load_snap_disallowed_after_boot_resources(res: VmmAction, res_name: &str) { - let (snapshot_file, memory_file) = verify_create_snapshot(false); + let (snapshot_file, memory_file) = verify_create_snapshot(false, false); let mut event_manager = EventManager::new().unwrap(); let empty_seccomp_filters = get_empty_filters(); From 4f4f5eb12b9b8118e5a30b0643ab11b8848f874c Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 21 May 2025 17:48:18 +0200 Subject: [PATCH 25/99] test: temporarily disable security A/B tests for PCI uVMs Tests test_spectre_meltdown_checker_on_guest and test_check_vulnerability_files_ab run A/B tests between the HEAD of the target branch and the tip of a PR branch. This will currently fail, because Firecracker builds from the HEAD of the target branch know nothing about the `--enable-pci` command line flag, so launching the Firecracker binary for revision A will fail. Only run these tests for non-PCI uVMs for now. Once this commit gets merged we will re-enable and make sure that everything works as expected. Signed-off-by: Babis Chalios --- .../security/test_vulnerabilities.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/tests/integration_tests/security/test_vulnerabilities.py b/tests/integration_tests/security/test_vulnerabilities.py index 711db312d80..01b8e9c595b 100644 --- a/tests/integration_tests/security/test_vulnerabilities.py +++ b/tests/integration_tests/security/test_vulnerabilities.py @@ -216,22 +216,18 @@ def microvm_factory_a(record_property): @pytest.fixture -def uvm_any_a( - microvm_factory_a, uvm_ctor, guest_kernel, rootfs, cpu_template_any, pci_enabled -): +def uvm_any_a(microvm_factory_a, uvm_ctor, guest_kernel, rootfs, cpu_template_any): """Return uvm with revision A firecracker Since pytest caches fixtures, this guarantees uvm_any_a will match a vm from uvm_any. 
See https://docs.pytest.org/en/stable/how-to/fixtures.html#fixtures-can-be-requested-more-than-once-per-test-return-values-are-cached """ - return uvm_ctor( - microvm_factory_a, guest_kernel, rootfs, cpu_template_any, pci_enabled - ) + return uvm_ctor(microvm_factory_a, guest_kernel, rootfs, cpu_template_any, False) -def test_check_vulnerability_files_ab(request, uvm_any): +def test_check_vulnerability_files_ab(request, uvm_any_without_pci): """Test vulnerability files on guests""" - res_b = check_vulnerabilities_files_on_guest(uvm_any) + res_b = check_vulnerabilities_files_on_guest(uvm_any_without_pci) if global_props.buildkite_pr: # we only get the uvm_any_a fixtures if we need it uvm_a = request.getfixturevalue("uvm_any_a") @@ -243,11 +239,11 @@ def test_check_vulnerability_files_ab(request, uvm_any): def test_spectre_meltdown_checker_on_guest( request, - uvm_any, + uvm_any_without_pci, spectre_meltdown_checker, ): """Test with the spectre / meltdown checker on any supported guest.""" - res_b = spectre_meltdown_checker.get_report_for_guest(uvm_any) + res_b = spectre_meltdown_checker.get_report_for_guest(uvm_any_without_pci) if global_props.buildkite_pr: # we only get the uvm_any_a fixtures if we need it uvm_a = request.getfixturevalue("uvm_any_a") @@ -255,5 +251,5 @@ def test_spectre_meltdown_checker_on_guest( assert res_b <= res_a else: assert res_b == spectre_meltdown_checker.expected_vulnerabilities( - uvm_any.cpu_template_name + uvm_any_without_pci.cpu_template_name ) From 2f3229d0721ef0f4d5b8aa18b86982b236003185 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Wed, 13 Nov 2024 12:13:22 +0000 Subject: [PATCH 26/99] test: update ci artifacts to support PCI devices 1. build the kernel with PCI/e support. 2. fix a race condition between udev renaming the network devices and fcnet setting up the network interfaces 3. install pciutils on the image Signed-off-by: Riccardo Mancini Signed-off-by: Babis Chalios --- resources/chroot.sh | 2 +- resources/guest_configs/pcie.config | 8 ++++++++ resources/overlay/etc/systemd/system/fcnet.service | 1 + resources/rebuild.sh | 11 ++++++----- 4 files changed, 16 insertions(+), 6 deletions(-) create mode 100644 resources/guest_configs/pcie.config diff --git a/resources/chroot.sh b/resources/chroot.sh index 93f6ca754f0..f87ae4aea08 100755 --- a/resources/chroot.sh +++ b/resources/chroot.sh @@ -11,7 +11,7 @@ PS4='+\t ' cp -ruv $rootfs/* / -packages="udev systemd-sysv openssh-server iproute2 curl socat python3-minimal iperf3 iputils-ping fio kmod tmux hwloc-nox vim-tiny trace-cmd linuxptp strace python3-boto3" +packages="udev systemd-sysv openssh-server iproute2 curl socat python3-minimal iperf3 iputils-ping fio kmod tmux hwloc-nox vim-tiny trace-cmd linuxptp strace python3-boto3 pciutils" # msr-tools is only supported on x86-64. 
arch=$(uname -m) diff --git a/resources/guest_configs/pcie.config b/resources/guest_configs/pcie.config new file mode 100644 index 00000000000..b7262f7ae73 --- /dev/null +++ b/resources/guest_configs/pcie.config @@ -0,0 +1,8 @@ +CONFIG_BLK_MQ_PCI=y +CONFIG_PCI=y +CONFIG_PCI_MMCONFIG=y +CONFIG_PCI_MSI=y +CONFIG_PCIEPORTBUS=y +CONFIG_VIRTIO_PCI=y +CONFIG_PCI_HOST_COMMON=y +CONFIG_PCI_HOST_GENERIC=y diff --git a/resources/overlay/etc/systemd/system/fcnet.service b/resources/overlay/etc/systemd/system/fcnet.service index 26d3af1dc20..ace1c8322e1 100644 --- a/resources/overlay/etc/systemd/system/fcnet.service +++ b/resources/overlay/etc/systemd/system/fcnet.service @@ -1,5 +1,6 @@ [Service] Type=oneshot +ExecStartPre=/usr/bin/udevadm settle ExecStart=/usr/local/bin/fcnet-setup.sh [Install] WantedBy=sshd.service diff --git a/resources/rebuild.sh b/resources/rebuild.sh index f7215af371e..a38cad79852 100755 --- a/resources/rebuild.sh +++ b/resources/rebuild.sh @@ -247,15 +247,16 @@ function build_al_kernels { clone_amazon_linux_repo CI_CONFIG="$PWD/guest_configs/ci.config" + PCIE_CONFIG="$PWD/guest_configs/pcie.config" if [[ "$KERNEL_VERSION" == @(all|5.10) ]]; then - build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10.config "$CI_CONFIG" + build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10.config "$CI_CONFIG" "$PCIE_CONFIG" fi if [[ $ARCH == "x86_64" && "$KERNEL_VERSION" == @(all|5.10-no-acpi) ]]; then - build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10-no-acpi.config "$CI_CONFIG" + build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10-no-acpi.config "$CI_CONFIG" "$PCIE_CONFIG" fi if [[ "$KERNEL_VERSION" == @(all|6.1) ]]; then - build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-6.1.config "$CI_CONFIG" + build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-6.1.config "$CI_CONFIG" "$PCIE_CONFIG" fi # Build debug kernels @@ -264,11 +265,11 @@ function build_al_kernels { OUTPUT_DIR=$OUTPUT_DIR/debug mkdir -pv $OUTPUT_DIR if [[ "$KERNEL_VERSION" == @(all|5.10) ]]; then - build_al_kernel "$PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10.config" "$CI_CONFIG" "$FTRACE_CONFIG" "$DEBUG_CONFIG" + build_al_kernel "$PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10.config" "$CI_CONFIG" "$PCIE_CONFIG" "$FTRACE_CONFIG" "$DEBUG_CONFIG" vmlinux_split_debuginfo $OUTPUT_DIR/vmlinux-5.10.* fi if [[ "$KERNEL_VERSION" == @(all|6.1) ]]; then - build_al_kernel "$PWD/guest_configs/microvm-kernel-ci-$ARCH-6.1.config" "$CI_CONFIG" "$FTRACE_CONFIG" "$DEBUG_CONFIG" + build_al_kernel "$PWD/guest_configs/microvm-kernel-ci-$ARCH-6.1.config" "$CI_CONFIG" "$PCIE_CONFIG" "$FTRACE_CONFIG" "$DEBUG_CONFIG" vmlinux_split_debuginfo $OUTPUT_DIR/vmlinux-6.1.* fi } From b4397a5d21eb5c874ee6c320b0d7bbe53f4f7b63 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Thu, 22 May 2025 15:50:29 +0200 Subject: [PATCH 27/99] tests: fix MMIO gaps in memory monitor tool The memory monitor was only assuming a single MMIO gap on x86_64 when calculating the memory regions that corresponded to guest memory. Now we need to account for two MMIO gaps in x86 and one in ARM. 
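To make the arithmetic concrete (worked example only, derived from the layout constants used below): on x86_64 the 32-bit gap covers [3 GiB, 4 GiB) and the 64-bit gap starts at 256 GiB, so a 512 GiB guest is backed by three guest-memory mappings of 3 GiB, 252 GiB (the space between the two gaps) and 257 GiB (what remains past the second gap). On aarch64, guest memory starts at 2 GiB and a single gap starts at 256 GiB, so the same guest splits into mappings of 254 GiB and 258 GiB.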
Signed-off-by: Babis Chalios
---
 tests/host_tools/memory.py | 79 ++++++++++++++++++++++++++++++--------
 1 file changed, 62 insertions(+), 17 deletions(-)

diff --git a/tests/host_tools/memory.py b/tests/host_tools/memory.py
index 93380a9321d..eacc14ac48a 100644
--- a/tests/host_tools/memory.py
+++ b/tests/host_tools/memory.py
@@ -8,6 +8,8 @@

 import psutil

+from framework.properties import global_props
+

 class MemoryUsageExceededError(Exception):
     """A custom exception containing details on excessive memory usage."""
@@ -15,8 +17,8 @@ class MemoryUsageExceededError(Exception):
     def __init__(self, usage, threshold, *args):
         """Compose the error message containing the memory consumption."""
         super().__init__(
-            f"Memory usage ({usage / 2**20:.2f} MiB) exceeded maximum threshold "
-            f"({threshold / 2**20} MiB)",
+            f"Memory usage ({usage / (1 << 20):.2f} MiB) exceeded maximum threshold "
+            f"({threshold / (1 << 20)} MiB)",
             *args,
         )

@@ -28,10 +30,20 @@ class MemoryMonitor(Thread):
     VMM memory usage.
     """

-    # If guest memory is >3328MB, it is split in a 2nd region
-    X86_MEMORY_GAP_START = 3328 * 2**20
-
-    def __init__(self, vm, threshold=5 * 2**20, period_s=0.05):
+    # If guest memory is >3GiB, it is split in a 2nd region
+    # Gap starts at 3GiB and is 1GiB long
+    X86_32BIT_MEMORY_GAP_START = 3 << 30
+    X86_32BIT_MEMORY_GAP_SIZE = 1 << 30
+    # If guest memory is >255GiB, it is split in a 3rd region
+    # Gap starts at 256GiB and is 256GiB long
+    X86_64BIT_MEMORY_GAP_START = 256 << 30
+    # On ARM64 we just have a single gap, but memory starts at an offset
+    # Gap starts at 256GiB
+    # Memory starts at 2GiB
+    ARM64_64BIT_MEMORY_GAP_START = 256 << 30
+    ARM64_MEMORY_START = 2 << 30
+
+    def __init__(self, vm, threshold=5 << 20, period_s=0.01):
         """Initialize monitor attributes."""
         Thread.__init__(self)
         self._vm = vm
@@ -72,7 +84,9 @@ def run(self):
             mem_total = 0
             for mmap in mmaps:
                 if self.is_guest_mem(mmap.size, guest_mem_bytes):
+                    print(f"Region {mmap} is guest memory")
                     continue
+
                 mem_total += mmap.rss
             self._current_rss = mem_total
             if mem_total > self.threshold:
@@ -81,24 +95,55 @@ def run(self):

             time.sleep(self._period_s)

-    def is_guest_mem(self, size, guest_mem_bytes):
+    def is_guest_mem_x86(self, size, guest_mem_bytes):
         """
-        If the address is recognised as a guest memory region,
-        return True, otherwise return False.
+        Checks if a region is a guest memory region based on
+        x86_64 physical memory layout
         """
+        return size in (
+            # memory fits before the first gap
+            guest_mem_bytes,
+            # guest memory spans at least two regions & memory fits before the second gap
+            self.X86_32BIT_MEMORY_GAP_START,
+            # guest memory spans exactly two regions
+            guest_mem_bytes - self.X86_32BIT_MEMORY_GAP_START,
+            # guest memory fills the space between the two gaps
+            self.X86_64BIT_MEMORY_GAP_START
+            - self.X86_32BIT_MEMORY_GAP_START
+            - self.X86_32BIT_MEMORY_GAP_SIZE,
+            # guest memory spans 3 regions, this is what remains past the second gap
+            guest_mem_bytes
+            - self.X86_64BIT_MEMORY_GAP_START
+            + self.X86_32BIT_MEMORY_GAP_SIZE,
+        )
-        # If x86_64 guest memory exceeds 3328M, it will be split
-        # in 2 regions: 3328M and the rest. We have 3 cases here
-        # to recognise a guest memory region:
-        # - its size matches the guest memory exactly
-        # - its size is 3328M
-        # - its size is guest memory minus 3328M.
+ def is_guest_mem_arch64(self, size, guest_mem_bytes): + """ + Checks if a region is a guest memory region based on + ARM64 physical memory layout + """ return size in ( + # guest memory fits before the gap guest_mem_bytes, - self.X86_MEMORY_GAP_START, - guest_mem_bytes - self.X86_MEMORY_GAP_START, + # guest memory fills the space before the gap + self.ARM64_64BIT_MEMORY_GAP_START - self.ARM64_MEMORY_START, + # guest memory spans 2 regions, this is what remains past the gap + guest_mem_bytes + - self.ARM64_64BIT_MEMORY_GAP_START + + self.ARM64_MEMORY_START, ) + def is_guest_mem(self, size, guest_mem_bytes): + """ + If the address is recognised as a guest memory region, + return True, otherwise return False. + """ + + if global_props.cpu_architecture == "x86_64": + return self.is_guest_mem_x86(size, guest_mem_bytes) + + return self.is_guest_mem_arch64(size, guest_mem_bytes) + def check_samples(self): """Check that there are no samples over the threshold.""" if self._exceeded is not None: From 2c6774dd9adedc243158554ea2898224ce4d8890 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 4 Jun 2025 12:33:10 +0200 Subject: [PATCH 28/99] fix: boottimer device MMIO address When we re-arranged the MMIO address space in commit 9a165d17f1ba (arch: define 64-bit capable MMIO memory regions) we moved the MMIO region of the boot timer device for x86 systems, but we didn't update the init scripts that hardcode it and use it to report boot time timestamp back to Firecracker. Update the init.c and initramfs values for the region. Also, add a functional test that runs during CI PR tests and makes sure the boot timer works. Signed-off-by: Babis Chalios --- resources/overlay/usr/local/bin/init.c | 2 +- resources/rebuild.sh | 2 +- .../performance/test_boottime.py | 42 +++++++++++++------ 3 files changed, 31 insertions(+), 15 deletions(-) diff --git a/resources/overlay/usr/local/bin/init.c b/resources/overlay/usr/local/bin/init.c index caa3e9d91d5..4d469171ae5 100644 --- a/resources/overlay/usr/local/bin/init.c +++ b/resources/overlay/usr/local/bin/init.c @@ -13,7 +13,7 @@ // Position on the bus is defined by MMIO_LEN increments, where MMIO_LEN is // defined as 0x1000 in vmm/src/device_manager/mmio.rs. 
#ifdef __x86_64__ -#define MAGIC_MMIO_SIGNAL_GUEST_BOOT_COMPLETE 0xd0000000 +#define MAGIC_MMIO_SIGNAL_GUEST_BOOT_COMPLETE 0xc0000000 #endif #ifdef __aarch64__ #define MAGIC_MMIO_SIGNAL_GUEST_BOOT_COMPLETE 0x40000000 diff --git a/resources/rebuild.sh b/resources/rebuild.sh index a38cad79852..38313b6d0f0 100755 --- a/resources/rebuild.sh +++ b/resources/rebuild.sh @@ -109,7 +109,7 @@ function build_initramfs { # Report guest boot time back to Firecracker via MMIO # See arch/src/lib.rs and the BootTimer device - MAGIC_BOOT_ADDRESS=0xd0000000 + MAGIC_BOOT_ADDRESS=0xc0000000 if [ $ARCH = "aarch64" ]; then MAGIC_BOOT_ADDRESS=0x40000000 fi diff --git a/tests/integration_tests/performance/test_boottime.py b/tests/integration_tests/performance/test_boottime.py index 77812738b17..69cacfd094a 100644 --- a/tests/integration_tests/performance/test_boottime.py +++ b/tests/integration_tests/performance/test_boottime.py @@ -94,6 +94,33 @@ def to_ms(v, unit): return kernel, userspace, total +def launch_vm_with_boot_timer( + microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib +): + """Launches a microVM with guest-timer and returns the reported metrics for it""" + vm = microvm_factory.build(guest_kernel_acpi, rootfs_rw) + vm.jailer.extra_args.update({"boot-timer": None}) + vm.spawn() + vm.basic_config( + vcpu_count=vcpu_count, + mem_size_mib=mem_size_mib, + boot_args=DEFAULT_BOOT_ARGS + " init=/usr/local/bin/init", + enable_entropy_device=True, + ) + vm.add_net_iface() + vm.start() + vm.pin_threads(0) + + boot_time_us, cpu_boot_time_us = get_boottime_device_info(vm) + + return (vm, boot_time_us, cpu_boot_time_us) + + +def test_boot_timer(microvm_factory, guest_kernel_acpi, rootfs): + """Tests that the boot timer device works""" + launch_vm_with_boot_timer(microvm_factory, guest_kernel_acpi, rootfs, 1, 128) + + @pytest.mark.parametrize( "vcpu_count,mem_size_mib", [(1, 128), (1, 1024), (2, 2048), (4, 4096)], @@ -105,20 +132,9 @@ def test_boottime( """Test boot time with different guest configurations""" for i in range(10): - vm = microvm_factory.build(guest_kernel_acpi, rootfs_rw) - vm.jailer.extra_args.update({"boot-timer": None}) - vm.spawn() - vm.basic_config( - vcpu_count=vcpu_count, - mem_size_mib=mem_size_mib, - boot_args=DEFAULT_BOOT_ARGS + " init=/usr/local/bin/init", - enable_entropy_device=True, + vm, boot_time_us, cpu_boot_time_us = launch_vm_with_boot_timer( + microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib ) - vm.add_net_iface() - vm.start() - vm.pin_threads(0) - - boot_time_us, cpu_boot_time_us = get_boottime_device_info(vm) if i == 0: metrics.set_dimensions( From 319086f35e7a80c6af320e3e07548f0d6baa15a5 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 4 Jun 2025 16:13:32 +0200 Subject: [PATCH 29/99] test: remove logging from memory monitor Commit be5a600e (tests: fix MMIO gaps in memory monitor tool) that fixed the memory monitor to account for the 64-bit MMIO region included a left-over debug print. Remove it. 
Signed-off-by: Babis Chalios --- tests/host_tools/memory.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/host_tools/memory.py b/tests/host_tools/memory.py index eacc14ac48a..d9c2a01fe06 100644 --- a/tests/host_tools/memory.py +++ b/tests/host_tools/memory.py @@ -84,7 +84,6 @@ def run(self): mem_total = 0 for mmap in mmaps: if self.is_guest_mem(mmap.size, guest_mem_bytes): - print(f"Region {mmap} is guest memory") continue mem_total += mmap.rss From f6f2c4df0bf073f38193c70cdd5ddfe0059be48b Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 30 May 2025 18:14:21 +0200 Subject: [PATCH 30/99] chore: update kvm and vmm-sys-util dependencies We need the new KvmIrqRouting FamStruct wrapper from kvm-bindings, which though forces us to update vmm-sys-util to 0.14.0 and also bump all downstream dependencies of vmm-sys-util to use that version. Signed-off-by: Babis Chalios --- Cargo.lock | 116 +++++++++++++++++++++++++++++++------ src/firecracker/Cargo.toml | 5 +- 2 files changed, 103 insertions(+), 18 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0e44d6a7596..91e60fd3c54 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -98,7 +98,7 @@ version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c8bdeb6047d8983be085bab0ba1472e6dc604e7041dbf6fcd5e71523014fae9" dependencies = [ - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -109,7 +109,7 @@ checksum = "403f75924867bb1033c59fbf0797484329750cfbe3c4325cd33127941fabc882" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -218,7 +218,7 @@ dependencies = [ "bitflags 2.9.1", "cexpr", "clang-sys", - "itertools 0.10.5", + "itertools 0.12.1", "lazy_static", "lazycell", "log", @@ -595,7 +595,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "778e2ac28f6c47af28e4907f13ffd1e1ddbd400980a9abd7c8df189bf578a5ad" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.60.2", ] [[package]] @@ -727,7 +727,7 @@ version = "0.5.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589533453244b0995c858700322199b2becb13b627df2851f64a2775d024abcf" dependencies = [ - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -764,6 +764,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.13.0" @@ -892,7 +901,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" dependencies = [ "cfg-if", - "windows-targets", + "windows-targets 0.53.2", ] [[package]] @@ -1234,7 +1243,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -1808,7 +1817,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -1823,7 +1832,16 @@ version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.60.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.2", ] [[package]] @@ -1832,14 +1850,30 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.53.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c66f69fcc9ce11da9966ddb31a40968cad001c5bedeb5c2b82ede4253ab48aef" +dependencies = [ + "windows_aarch64_gnullvm 0.53.0", + "windows_aarch64_msvc 0.53.0", + "windows_i686_gnu 0.53.0", + "windows_i686_gnullvm 0.53.0", + "windows_i686_msvc 0.53.0", + "windows_x86_64_gnu 0.53.0", + "windows_x86_64_gnullvm 0.53.0", + "windows_x86_64_msvc 0.53.0", ] [[package]] @@ -1848,48 +1882,96 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" + [[package]] name = "windows_i686_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" +[[package]] +name = "windows_i686_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" + [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" + [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_i686_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" + [[package]] name = "winnow" version = "0.7.12" diff --git a/src/firecracker/Cargo.toml b/src/firecracker/Cargo.toml index c7e6c6a5d2e..74812a0f66d 100644 --- a/src/firecracker/Cargo.toml +++ b/src/firecracker/Cargo.toml @@ -42,7 +42,10 @@ serde_json = "1.0.142" [dev-dependencies] cargo_toml = "0.22.3" libc = "0.2.174" -regex = { version = "1.11.1", default-features = false, features = ["std", "unicode-perl"] } +regex = { version = "1.11.1", default-features = false, features = [ + "std", + "unicode-perl", +] } # Dev-Dependencies for uffd examples serde = { version = "1.0.219", features = ["derive"] } From c7a99f0d4152896d3e1bea8fb7145d92f2612213 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 3 Jun 2025 18:14:33 +0200 Subject: [PATCH 31/99] pci: fixes in PCI crate Define thiserror::Error and displaydoc::Display for various error types in the vended PCI crate. This way we can embed them in our error types downstream. Also export a few types and struct fields that were private and we will be needing them. Signed-off-by: Babis Chalios --- Cargo.lock | 1 + src/pci/Cargo.toml | 1 + src/pci/src/bus.rs | 4 ++-- src/pci/src/configuration.rs | 6 +++--- src/pci/src/device.rs | 29 +++++------------------------ src/pci/src/lib.rs | 11 +++++++---- src/pci/src/msix.rs | 18 +++++++++++++++--- 7 files changed, 34 insertions(+), 36 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 91e60fd3c54..8f20f8f2d50 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1047,6 +1047,7 @@ name = "pci" version = "0.1.0" dependencies = [ "byteorder", + "displaydoc", "libc", "log", "serde", diff --git a/src/pci/Cargo.toml b/src/pci/Cargo.toml index c88cd270b23..3549d5010fe 100644 --- a/src/pci/Cargo.toml +++ b/src/pci/Cargo.toml @@ -13,6 +13,7 @@ default = [] [dependencies] byteorder = "1.5.0" +displaydoc = "0.2.5" libc = "0.2.172" log = "0.4.27" serde = { version = "1.0.219", features = ["derive"] } diff --git a/src/pci/src/bus.rs b/src/pci/src/bus.rs index cb42b4ee9c5..775238edff9 100644 --- a/src/pci/src/bus.rs +++ b/src/pci/src/bus.rs @@ -24,7 +24,7 @@ const DEVICE_ID_INTEL_VIRT_PCIE_HOST: u16 = 0x0d57; const NUM_DEVICE_IDS: usize = 32; /// Errors for device manager. -#[derive(Debug)] +#[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum PciRootError { /// Could not allocate device address space for the device. 
AllocateDeviceAddrs(PciDeviceError), @@ -103,7 +103,7 @@ impl PciDevice for PciRoot { pub struct PciBus { /// Devices attached to this bus. /// Device 0 is host bridge. - devices: HashMap>>, + pub devices: HashMap>>, device_reloc: Arc, device_ids: Vec, } diff --git a/src/pci/src/configuration.rs b/src/pci/src/configuration.rs index 3a53167148c..c37f8026fbe 100644 --- a/src/pci/src/configuration.rs +++ b/src/pci/src/configuration.rs @@ -409,7 +409,7 @@ struct PciBar { r#type: Option, } -#[derive(Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct PciConfigurationState { registers: Vec, writable_bits: Vec, @@ -466,7 +466,7 @@ impl From for PciBarType { } } -#[derive(Copy, Clone)] +#[derive(Debug, Copy, Clone, Serialize, Deserialize)] pub enum PciBarPrefetchable { NotPrefetchable = 0, Prefetchable = 0x08, @@ -481,7 +481,7 @@ impl From for bool { } } -#[derive(Copy, Clone)] +#[derive(Debug, Copy, Clone, Serialize, Deserialize)] pub struct PciBarConfiguration { addr: u64, size: u64, diff --git a/src/pci/src/device.rs b/src/pci/src/device.rs index d3bd3056a36..bf89331faa9 100644 --- a/src/pci/src/device.rs +++ b/src/pci/src/device.rs @@ -6,7 +6,6 @@ // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause use std::any::Any; -use std::fmt::{self, Display}; use std::sync::{Arc, Barrier}; use std::{io, result}; @@ -16,39 +15,21 @@ use vm_device::Resource; use crate::configuration::{self, PciBarRegionType}; use crate::PciBarConfiguration; -#[derive(Debug)] +#[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum Error { - /// Setup of the device capabilities failed. + /// Setup of the device capabilities failed: {0}. CapabilitiesSetup(configuration::Error), - /// Allocating space for an IO BAR failed. + /// Allocating space for an IO BAR failed, size={0}. IoAllocationFailed(u64), - /// Registering an IO BAR failed. + /// Registering an IO BAR at address {0} failed: {1} IoRegistrationFailed(u64, configuration::Error), /// Expected resource not found. MissingResource, - /// Invalid resource. 
+ /// Invalid resource InvalidResource(Resource), } pub type Result = std::result::Result; -impl Display for Error { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - use self::Error::*; - - match self { - CapabilitiesSetup(e) => write!(f, "failed to add capability {e}"), - IoAllocationFailed(size) => { - write!(f, "failed to allocate space for an IO BAR, size={size}") - } - IoRegistrationFailed(addr, e) => { - write!(f, "failed to register an IO BAR, addr={addr} err={e}") - } - MissingResource => write!(f, "failed to find expected resource"), - InvalidResource(r) => write!(f, "invalid resource {r:?}"), - } - } -} - #[derive(Clone, Copy)] pub struct BarReprogrammingParams { pub old_base: u64, diff --git a/src/pci/src/lib.rs b/src/pci/src/lib.rs index 2672159e474..3162da292de 100644 --- a/src/pci/src/lib.rs +++ b/src/pci/src/lib.rs @@ -24,15 +24,18 @@ use serde::de::Visitor; pub use self::bus::{PciBus, PciConfigIo, PciConfigMmio, PciRoot, PciRootError}; pub use self::configuration::{ PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciCapability, PciCapabilityId, - PciClassCode, PciConfiguration, PciExpressCapabilityId, PciHeaderType, PciMassStorageSubclass, - PciNetworkControllerSubclass, PciProgrammingInterface, PciSerialBusSubClass, PciSubclass, - PCI_CONFIGURATION_ID, + PciClassCode, PciConfiguration, PciConfigurationState, PciExpressCapabilityId, PciHeaderType, + PciMassStorageSubclass, PciNetworkControllerSubclass, PciProgrammingInterface, + PciSerialBusSubClass, PciSubclass, PCI_CONFIGURATION_ID, }; pub use self::device::{ BarReprogrammingParams, DeviceRelocation, Error as PciDeviceError, PciDevice, }; pub use self::msi::{msi_num_enabled_vectors, MsiCap, MsiConfig}; -pub use self::msix::{MsixCap, MsixConfig, MsixTableEntry, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE}; +pub use self::msix::{ + Error as MsixError, MsixCap, MsixConfig, MsixConfigState, MsixTableEntry, MSIX_CONFIG_ID, + MSIX_TABLE_ENTRY_SIZE, +}; /// PCI has four interrupt pins A->D. #[derive(Copy, Clone)] diff --git a/src/pci/src/msix.rs b/src/pci/src/msix.rs index 4b3cf688980..be5aa3b8cf1 100644 --- a/src/pci/src/msix.rs +++ b/src/pci/src/msix.rs @@ -26,7 +26,7 @@ const MSIX_ENABLE_MASK: u16 = (1 << MSIX_ENABLE_BIT) as u16; pub const MSIX_TABLE_ENTRY_SIZE: usize = 16; pub const MSIX_CONFIG_ID: &str = "msix_config"; -#[derive(Debug)] +#[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum Error { /// Failed enabling the interrupt route. 
EnableInterruptRoute(io::Error), @@ -59,7 +59,7 @@ impl Default for MsixTableEntry { } } -#[derive(Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct MsixConfigState { table_entries: Vec, pba_entries: Vec, @@ -71,11 +71,23 @@ pub struct MsixConfig { pub table_entries: Vec, pub pba_entries: Vec, pub devid: u32, - interrupt_source_group: Arc, + pub interrupt_source_group: Arc, masked: bool, enabled: bool, } +impl std::fmt::Debug for MsixConfig { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("MsixConfig") + .field("table_entries", &self.table_entries) + .field("pba_entries", &self.pba_entries) + .field("devid", &self.devid) + .field("masked", &self.masked) + .field("enabled", &self.enabled) + .finish() + } +} + impl MsixConfig { pub fn new( msix_vectors: u16, From 31887615e162af0eceea67cbba8070359bdb7a4f Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 10 Jun 2025 17:05:27 +0200 Subject: [PATCH 32/99] vm-device: return reference to EventFd from Interrupt trait Instead of returning an `EventFd` type, which will actually force us to clone the file descriptor in the Firecracker side. Signed-off-by: Babis Chalios --- src/vm-device/src/interrupt/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vm-device/src/interrupt/mod.rs b/src/vm-device/src/interrupt/mod.rs index f4aec52a2e0..da5d87a4e1a 100644 --- a/src/vm-device/src/interrupt/mod.rs +++ b/src/vm-device/src/interrupt/mod.rs @@ -172,7 +172,7 @@ pub trait InterruptSourceGroup: Send + Sync { /// to inject interrupts into a guest, by writing to the file returned /// by this method. #[allow(unused_variables)] - fn notifier(&self, index: InterruptIndex) -> Option; + fn notifier(&self, index: InterruptIndex) -> Option<&EventFd>; /// Update the interrupt source group configuration. /// From fba4a49f8f8581ed22b439267ae1aa7d4c1e2aee Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 25 Jun 2025 12:57:06 +0200 Subject: [PATCH 33/99] cleanup: remove unused code from pci and vm-device crates This is code we are not going to use in Firecracker. Remove it, so we can keep the crates we vend as minimal as possible, including only things we are actually using. 
Signed-off-by: Babis Chalios --- src/pci/src/lib.rs | 2 - src/pci/src/msi.rs | 282 --------------------------- src/vm-device/src/dma_mapping/mod.rs | 18 -- src/vm-device/src/lib.rs | 1 - 4 files changed, 303 deletions(-) delete mode 100644 src/pci/src/msi.rs delete mode 100644 src/vm-device/src/dma_mapping/mod.rs diff --git a/src/pci/src/lib.rs b/src/pci/src/lib.rs index 3162da292de..f1dec5b126a 100644 --- a/src/pci/src/lib.rs +++ b/src/pci/src/lib.rs @@ -12,7 +12,6 @@ extern crate log; mod bus; mod configuration; mod device; -mod msi; mod msix; use std::fmt::{self, Debug, Display}; @@ -31,7 +30,6 @@ pub use self::configuration::{ pub use self::device::{ BarReprogrammingParams, DeviceRelocation, Error as PciDeviceError, PciDevice, }; -pub use self::msi::{msi_num_enabled_vectors, MsiCap, MsiConfig}; pub use self::msix::{ Error as MsixError, MsixCap, MsixConfig, MsixConfigState, MsixTableEntry, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, diff --git a/src/pci/src/msi.rs b/src/pci/src/msi.rs deleted file mode 100644 index 16d593cd115..00000000000 --- a/src/pci/src/msi.rs +++ /dev/null @@ -1,282 +0,0 @@ -// Copyright © 2019 Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause -// - -use std::io; -use std::sync::Arc; - -use byteorder::{ByteOrder, LittleEndian}; -use serde::{Deserialize, Serialize}; -use thiserror::Error; -use vm_device::interrupt::{ - InterruptIndex, InterruptSourceConfig, InterruptSourceGroup, MsiIrqSourceConfig, -}; - -// MSI control masks -const MSI_CTL_ENABLE: u16 = 0x1; -const MSI_CTL_MULTI_MSG_ENABLE: u16 = 0x70; -const MSI_CTL_64_BITS: u16 = 0x80; -const MSI_CTL_PER_VECTOR: u16 = 0x100; - -// MSI message offsets -const MSI_MSG_CTL_OFFSET: u64 = 0x2; -const MSI_MSG_ADDR_LO_OFFSET: u64 = 0x4; - -// MSI message masks -const MSI_MSG_ADDR_LO_MASK: u32 = 0xffff_fffc; - -pub fn msi_num_enabled_vectors(msg_ctl: u16) -> usize { - let field = (msg_ctl >> 4) & 0x7; - - if field > 5 { - return 0; - } - - 1 << field -} - -#[derive(Error, Debug)] -pub enum Error { - #[error("Failed enabling the interrupt route: {0}")] - EnableInterruptRoute(io::Error), - #[error("Failed updating the interrupt route: {0}")] - UpdateInterruptRoute(io::Error), -} - -#[derive(Clone, Copy, Default, Serialize, Deserialize)] -pub struct MsiCap { - // Message Control Register - // 0: MSI enable. - // 3-1; Multiple message capable. - // 6-4: Multiple message enable. - // 7: 64 bits address capable. - // 8: Per-vector masking capable. - // 15-9: Reserved. - pub msg_ctl: u16, - // Message Address (LSB) - // 1-0: Reserved. - // 31-2: Message address. - pub msg_addr_lo: u32, - // Message Upper Address (MSB) - // 31-0: Message address. - pub msg_addr_hi: u32, - // Message Data - // 15-0: Message data. - pub msg_data: u16, - // Mask Bits - // 31-0: Mask bits. - pub mask_bits: u32, - // Pending Bits - // 31-0: Pending bits. 
- pub pending_bits: u32, -} - -impl MsiCap { - fn addr_64_bits(&self) -> bool { - self.msg_ctl & MSI_CTL_64_BITS == MSI_CTL_64_BITS - } - - fn per_vector_mask(&self) -> bool { - self.msg_ctl & MSI_CTL_PER_VECTOR == MSI_CTL_PER_VECTOR - } - - fn enabled(&self) -> bool { - self.msg_ctl & MSI_CTL_ENABLE == MSI_CTL_ENABLE - } - - fn num_enabled_vectors(&self) -> usize { - msi_num_enabled_vectors(self.msg_ctl) - } - - fn vector_masked(&self, vector: usize) -> bool { - if !self.per_vector_mask() { - return false; - } - - (self.mask_bits >> vector) & 0x1 == 0x1 - } - - fn size(&self) -> u64 { - let mut size: u64 = 0xa; - - if self.addr_64_bits() { - size += 0x4; - } - if self.per_vector_mask() { - size += 0xa; - } - - size - } - - fn update(&mut self, offset: u64, data: &[u8]) { - // Calculate message data offset depending on the address being 32 or - // 64 bits. - // Calculate upper address offset if the address is 64 bits. - // Calculate mask bits offset based on the address being 32 or 64 bits - // and based on the per vector masking being enabled or not. - let (msg_data_offset, addr_hi_offset, mask_bits_offset): (u64, Option, Option) = - if self.addr_64_bits() { - let mask_bits = if self.per_vector_mask() { - Some(0x10) - } else { - None - }; - (0xc, Some(0x8), mask_bits) - } else { - let mask_bits = if self.per_vector_mask() { - Some(0xc) - } else { - None - }; - (0x8, None, mask_bits) - }; - - // Update cache without overriding the read-only bits. - match data.len() { - 2 => { - let value = LittleEndian::read_u16(data); - match offset { - MSI_MSG_CTL_OFFSET => { - self.msg_ctl = (self.msg_ctl & !(MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) - | (value & (MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) - } - x if x == msg_data_offset => self.msg_data = value, - _ => error!("invalid offset"), - } - } - 4 => { - let value = LittleEndian::read_u32(data); - match offset { - 0x0 => { - self.msg_ctl = (self.msg_ctl & !(MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) - | ((value >> 16) as u16 & (MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) - } - MSI_MSG_ADDR_LO_OFFSET => self.msg_addr_lo = value & MSI_MSG_ADDR_LO_MASK, - x if x == msg_data_offset => self.msg_data = value as u16, - x if addr_hi_offset.is_some() && x == addr_hi_offset.unwrap() => { - self.msg_addr_hi = value - } - x if mask_bits_offset.is_some() && x == mask_bits_offset.unwrap() => { - self.mask_bits = value - } - _ => error!("invalid offset"), - } - } - _ => error!("invalid data length"), - } - } -} - -#[derive(Serialize, Deserialize)] -pub struct MsiConfigState { - cap: MsiCap, -} - -pub struct MsiConfig { - pub cap: MsiCap, - interrupt_source_group: Arc, -} - -impl MsiConfig { - pub fn new( - msg_ctl: u16, - interrupt_source_group: Arc, - state: Option, - ) -> Result { - let cap = if let Some(state) = state { - if state.cap.enabled() { - for idx in 0..state.cap.num_enabled_vectors() { - let config = MsiIrqSourceConfig { - high_addr: state.cap.msg_addr_hi, - low_addr: state.cap.msg_addr_lo, - data: state.cap.msg_data as u32, - devid: 0, - }; - - interrupt_source_group - .update( - idx as InterruptIndex, - InterruptSourceConfig::MsiIrq(config), - state.cap.vector_masked(idx), - false, - ) - .map_err(Error::UpdateInterruptRoute)?; - } - - interrupt_source_group - .set_gsi() - .map_err(Error::EnableInterruptRoute)?; - - interrupt_source_group - .enable() - .map_err(Error::EnableInterruptRoute)?; - } - - state.cap - } else { - MsiCap { - msg_ctl, - ..Default::default() - } - }; - - Ok(MsiConfig { - cap, - interrupt_source_group, - }) - } - - pub 
fn state(&self) -> MsiConfigState { - MsiConfigState { cap: self.cap } - } - - pub fn enabled(&self) -> bool { - self.cap.enabled() - } - - pub fn size(&self) -> u64 { - self.cap.size() - } - - pub fn num_enabled_vectors(&self) -> usize { - self.cap.num_enabled_vectors() - } - - pub fn update(&mut self, offset: u64, data: &[u8]) { - let old_enabled = self.cap.enabled(); - - self.cap.update(offset, data); - - if self.cap.enabled() { - for idx in 0..self.num_enabled_vectors() { - let config = MsiIrqSourceConfig { - high_addr: self.cap.msg_addr_hi, - low_addr: self.cap.msg_addr_lo, - data: self.cap.msg_data as u32, - devid: 0, - }; - - if let Err(e) = self.interrupt_source_group.update( - idx as InterruptIndex, - InterruptSourceConfig::MsiIrq(config), - self.cap.vector_masked(idx), - true, - ) { - error!("Failed updating vector: {:?}", e); - } - } - - if !old_enabled { - if let Err(e) = self.interrupt_source_group.enable() { - error!("Failed enabling irq_fd: {:?}", e); - } - } - } else if old_enabled { - if let Err(e) = self.interrupt_source_group.disable() { - error!("Failed disabling irq_fd: {:?}", e); - } - } - } -} diff --git a/src/vm-device/src/dma_mapping/mod.rs b/src/vm-device/src/dma_mapping/mod.rs deleted file mode 100644 index 6cba6e16488..00000000000 --- a/src/vm-device/src/dma_mapping/mod.rs +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. -// Copyright © 2021 Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause - -/// Trait to trigger DMA mapping updates for devices managed by virtio-iommu -/// -/// Trait meant for triggering the DMA mapping update related to an external -/// device not managed fully through virtio. It is dedicated to virtio-iommu -/// in order to trigger the map update anytime the mapping is updated from the -/// guest. -pub trait ExternalDmaMapping: Send + Sync { - /// Map a memory range - fn map(&self, iova: u64, gpa: u64, size: u64) -> std::result::Result<(), std::io::Error>; - - /// Unmap a memory range - fn unmap(&self, iova: u64, size: u64) -> std::result::Result<(), std::io::Error>; -} diff --git a/src/vm-device/src/lib.rs b/src/vm-device/src/lib.rs index fe06fd8b465..b980b09c4b9 100644 --- a/src/vm-device/src/lib.rs +++ b/src/vm-device/src/lib.rs @@ -9,7 +9,6 @@ use serde::{Deserialize, Serialize}; mod bus; -pub mod dma_mapping; pub mod interrupt; pub use self::bus::{Bus, BusDevice, BusDeviceSync, Error as BusError}; From 5c42eac1e4d33dfda1be1487575020f5c7acf282 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 2 Jun 2025 18:46:32 +0200 Subject: [PATCH 34/99] refactor: allow storing Arc within Vmm We'd like to be able to store Vm within an atomic reference so we can pass it around and share it with other components. The main issue with doing this change is that we need Vm to be `mut` during initialization and the builder.rs code was creating Vmm with Vm embedded in it. To solve this, we break down the initialization of the Vmm object. We first create its individual parts (Vm, Kvm and DeviceManager), perform any necessary initialization logic on Vm and once this done add it within an Arc. 
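The resulting initialization order looks roughly like the sketch below. This is not the actual builder code (the real constructors take many more arguments and return errors); the types are hypothetical stand-ins that only show the pattern: do all the `&mut` setup on Vm first, then move it behind an Arc when constructing the Vmm.

    use std::sync::Arc;

    // Hypothetical stand-ins for the real Kvm, Vm, DeviceManager and Vmm types.
    struct Kvm;
    struct Vm {
        regions: usize,
    }
    struct DeviceManager;

    struct Vmm {
        kvm: Kvm,
        // Vm is shared (and no longer mutated) once the Vmm exists.
        vm: Arc<Vm>,
        device_manager: DeviceManager,
    }

    impl Vm {
        fn new(_kvm: &Kvm) -> Self {
            Vm { regions: 0 }
        }

        // Needs `&mut self`, which is why Vm must be fully set up before it is
        // handed over to the Vmm.
        fn register_memory_regions(&mut self, count: usize) {
            self.regions += count;
        }
    }

    fn build_microvm() -> Vmm {
        let kvm = Kvm;
        let mut vm = Vm::new(&kvm);
        vm.register_memory_regions(1);
        let device_manager = DeviceManager;
        // All mutable initialization is done; wrap the Vm in an Arc so it can
        // be shared with other components.
        Vmm {
            kvm,
            vm: Arc::new(vm),
            device_manager,
        }
    }

    fn main() {
        let vmm = build_microvm();
        assert_eq!(vmm.vm.regions, 1);
    }

Before this change, create_vmm_and_vcpus() built the whole Vmm (with Vm embedded in it) up front, which is why the Vm field could not simply be wrapped in an Arc.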
Signed-off-by: Babis Chalios --- src/vmm/src/arch/aarch64/mod.rs | 23 +- src/vmm/src/arch/x86_64/mod.rs | 26 ++- src/vmm/src/builder.rs | 309 +++++++++++++++----------- src/vmm/src/device_manager/acpi.rs | 4 +- src/vmm/src/device_manager/mmio.rs | 13 +- src/vmm/src/device_manager/mod.rs | 12 + src/vmm/src/device_manager/persist.rs | 1 + src/vmm/src/lib.rs | 3 +- 8 files changed, 230 insertions(+), 161 deletions(-) diff --git a/src/vmm/src/arch/aarch64/mod.rs b/src/vmm/src/arch/aarch64/mod.rs index df6e712dcf5..a599db5dea7 100644 --- a/src/vmm/src/arch/aarch64/mod.rs +++ b/src/vmm/src/arch/aarch64/mod.rs @@ -32,7 +32,7 @@ use crate::utils::{align_up, u64_to_usize, usize_to_u64}; use crate::vmm_config::machine_config::MachineConfig; use crate::vstate::memory::{Address, Bytes, GuestAddress, GuestMemory, GuestMemoryMmap}; use crate::vstate::vcpu::KvmVcpuError; -use crate::{Vcpu, VcpuConfig, Vmm, logger}; +use crate::{DeviceManager, Kvm, Vcpu, VcpuConfig, Vm, logger}; /// Errors thrown while configuring aarch64 system. #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -82,8 +82,11 @@ pub fn arch_memory_regions(size: usize) -> Vec<(GuestAddress, usize)> { } /// Configures the system for booting Linux. +#[allow(clippy::too_many_arguments)] pub fn configure_system_for_boot( - vmm: &mut Vmm, + kvm: &Kvm, + vm: &Vm, + device_manager: &mut DeviceManager, vcpus: &mut [Vcpu], machine_config: &MachineConfig, cpu_template: &CustomCpuTemplate, @@ -103,11 +106,11 @@ pub fn configure_system_for_boot( cpu_config, }; - let optional_capabilities = vmm.kvm.optional_capabilities(); + let optional_capabilities = kvm.optional_capabilities(); // Configure vCPUs with normalizing and setting the generated CPU configuration. for vcpu in vcpus.iter_mut() { vcpu.kvm_vcpu.configure( - vmm.vm.guest_memory(), + vm.guest_memory(), entry_point, &vcpu_config, &optional_capabilities, @@ -123,18 +126,16 @@ pub fn configure_system_for_boot( .expect("Cannot create cstring from cmdline string"); let fdt = fdt::create_fdt( - vmm.vm.guest_memory(), + vm.guest_memory(), vcpu_mpidr, cmdline, - &vmm.device_manager, - vmm.vm.get_irqchip(), + device_manager, + vm.get_irqchip(), initrd, )?; - let fdt_address = GuestAddress(get_fdt_addr(vmm.vm.guest_memory())); - vmm.vm - .guest_memory() - .write_slice(fdt.as_slice(), fdt_address)?; + let fdt_address = GuestAddress(get_fdt_addr(vm.guest_memory())); + vm.guest_memory().write_slice(fdt.as_slice(), fdt_address)?; Ok(()) } diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index fe1296e5d1c..68b903d5ff6 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -33,6 +33,7 @@ pub mod generated; use std::fs::File; +use kvm::Kvm; use layout::{ CMDLINE_START, FIRST_ADDR_PAST_32BITS, FIRST_ADDR_PAST_64BITS_MMIO, MMIO32_MEM_SIZE, MMIO32_MEM_START, MMIO64_MEM_SIZE, MMIO64_MEM_START, PCI_MMCONFIG_SIZE, PCI_MMCONFIG_START, @@ -53,6 +54,7 @@ use crate::acpi::create_acpi_tables; use crate::arch::{BootProtocol, SYSTEM_MEM_SIZE, SYSTEM_MEM_START, arch_memory_regions_with_gap}; use crate::cpu_config::templates::{CustomCpuTemplate, GuestConfigError}; use crate::cpu_config::x86_64::CpuConfiguration; +use crate::device_manager::DeviceManager; use crate::initrd::InitrdConfig; use crate::utils::{align_down, u64_to_usize, usize_to_u64}; use crate::vmm_config::machine_config::MachineConfig; @@ -60,7 +62,7 @@ use crate::vstate::memory::{ Address, GuestAddress, GuestMemory, GuestMemoryMmap, GuestMemoryRegion, }; use 
crate::vstate::vcpu::KvmVcpuConfigureError; -use crate::{Vcpu, VcpuConfig, Vmm, logger}; +use crate::{Vcpu, VcpuConfig, Vm, logger}; // Value taken from https://elixir.bootlin.com/linux/v5.10.68/source/arch/x86/include/uapi/asm/e820.h#L31 // Usable normal RAM @@ -169,8 +171,11 @@ pub fn initrd_load_addr(guest_mem: &GuestMemoryMmap, initrd_size: usize) -> Opti } /// Configures the system for booting Linux. +#[allow(clippy::too_many_arguments)] pub fn configure_system_for_boot( - vmm: &mut Vmm, + kvm: &Kvm, + vm: &Vm, + device_manager: &mut DeviceManager, vcpus: &mut [Vcpu], machine_config: &MachineConfig, cpu_template: &CustomCpuTemplate, @@ -179,8 +184,7 @@ pub fn configure_system_for_boot( boot_cmdline: Cmdline, ) -> Result<(), ConfigurationError> { // Construct the base CpuConfiguration to apply CPU template onto. - let cpu_config = - CpuConfiguration::new(vmm.kvm.supported_cpuid.clone(), cpu_template, &vcpus[0])?; + let cpu_config = CpuConfiguration::new(kvm.supported_cpuid.clone(), cpu_template, &vcpus[0])?; // Apply CPU template to the base CpuConfiguration. let cpu_config = CpuConfiguration::apply_template(cpu_config, cpu_template)?; @@ -193,7 +197,7 @@ pub fn configure_system_for_boot( // Configure vCPUs with normalizing and setting the generated CPU configuration. for vcpu in vcpus.iter_mut() { vcpu.kvm_vcpu - .configure(vmm.vm.guest_memory(), entry_point, &vcpu_config)?; + .configure(vm.guest_memory(), entry_point, &vcpu_config)?; } // Write the kernel command line to guest memory. This is x86_64 specific, since on @@ -204,7 +208,7 @@ pub fn configure_system_for_boot( .expect("Cannot create cstring from cmdline string"); load_cmdline( - vmm.vm.guest_memory(), + vm.guest_memory(), GuestAddress(crate::arch::x86_64::layout::CMDLINE_START), &boot_cmdline, ) @@ -212,19 +216,19 @@ pub fn configure_system_for_boot( // Note that this puts the mptable at the last 1k of Linux's 640k base RAM mptable::setup_mptable( - vmm.vm.guest_memory(), - &vmm.device_manager.resource_allocator, + vm.guest_memory(), + &device_manager.resource_allocator, vcpu_config.vcpu_count, ) .map_err(ConfigurationError::MpTableSetup)?; match entry_point.protocol { BootProtocol::PvhBoot => { - configure_pvh(vmm.vm.guest_memory(), GuestAddress(CMDLINE_START), initrd)?; + configure_pvh(vm.guest_memory(), GuestAddress(CMDLINE_START), initrd)?; } BootProtocol::LinuxBoot => { configure_64bit_boot( - vmm.vm.guest_memory(), + vm.guest_memory(), GuestAddress(CMDLINE_START), cmdline_size, initrd, @@ -234,7 +238,7 @@ pub fn configure_system_for_boot( // Create ACPI tables and write them in guest memory // For the time being we only support ACPI in x86_64 - create_acpi_tables(vmm.vm.guest_memory(), &mut vmm.device_manager, vcpus)?; + create_acpi_tables(vm.guest_memory(), device_manager, vcpus)?; Ok(()) } diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index be590ded918..3f47b743063 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -16,17 +16,18 @@ use utils::time::TimestampUs; #[cfg(target_arch = "aarch64")] use vm_memory::GuestAddress; +#[cfg(target_arch = "aarch64")] +use crate::Vcpu; use crate::arch::{ConfigurationError, configure_system_for_boot, load_kernel}; #[cfg(target_arch = "aarch64")] use crate::construct_kvm_mpidrs; -use crate::cpu_config::templates::{ - GetCpuTemplate, GetCpuTemplateError, GuestConfigError, KvmCapability, -}; +use crate::cpu_config::templates::{GetCpuTemplate, GetCpuTemplateError, GuestConfigError}; #[cfg(target_arch = "aarch64")] use 
crate::device_manager::AttachLegacyMmioDeviceError; use crate::device_manager::pci_mngr::PciManagerError; use crate::device_manager::{ - AttachMmioDeviceError, AttachVmgenidError, DeviceManager, DevicePersistError, DeviceRestoreArgs, + AttachMmioDeviceError, AttachVmgenidError, DeviceManager, DeviceManagerCreateError, + DevicePersistError, DeviceRestoreArgs, }; use crate::devices::acpi::vmgenid::VmGenIdError; use crate::devices::virtio::balloon::Balloon; @@ -43,10 +44,10 @@ use crate::resources::VmResources; use crate::seccomp::BpfThreadMap; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::MachineConfigError; -use crate::vstate::kvm::Kvm; +use crate::vstate::kvm::{Kvm, KvmError}; use crate::vstate::memory::GuestRegionMmap; -use crate::vstate::vcpu::{Vcpu, VcpuError}; -use crate::vstate::vm::Vm; +use crate::vstate::vcpu::VcpuError; +use crate::vstate::vm::{Vm, VmError}; use crate::{EventManager, Vmm, VmmError, device_manager}; /// Errors associated with starting the instance. @@ -61,6 +62,8 @@ pub enum StartMicrovmError { AttachLegacyDevices(#[from] AttachLegacyMmioDeviceError), /// System configuration error: {0} ConfigureSystem(#[from] ConfigurationError), + /// Failed to create device manager: {0} + CreateDeviceManager(#[from] DeviceManagerCreateError), /// Failed to create guest config: {0} CreateGuestConfig(#[from] GuestConfigError), /// Cannot create network device: {0} @@ -87,6 +90,8 @@ pub enum StartMicrovmError { GetCpuTemplate(#[from] GetCpuTemplateError), /// Invalid kernel command line: {0} KernelCmdline(String), + /// Kvm error: {0} + Kvm(#[from] KvmError), /// Cannot load command line string: {0} LoadCommandline(linux_loader::loader::Error), /// Cannot start microvm without kernel configuration. @@ -115,6 +120,8 @@ pub enum StartMicrovmError { /// Error cloning Vcpu fds #[cfg(feature = "gdb")] VcpuFdCloneError(#[from] crate::vstate::vcpu::CopyKvmFdError), + /// Error with the Vm object: {0} + Vm(#[from] VmError), } /// It's convenient to automatically convert `linux_loader::cmdline::Error`s @@ -125,37 +132,6 @@ impl std::convert::From for StartMicrovmError { } } -#[cfg_attr(target_arch = "aarch64", allow(unused))] -fn create_vmm_and_vcpus( - instance_info: &InstanceInfo, - event_manager: &mut EventManager, - vcpu_count: u8, - kvm_capabilities: Vec, -) -> Result<(Vmm, Vec), VmmError> { - let kvm = Kvm::new(kvm_capabilities)?; - // Set up Kvm Vm and register memory regions. - // Build custom CPU config if a custom template is provided. - let mut vm = Vm::new(&kvm)?; - - let (vcpus, vcpus_exit_evt) = vm.create_vcpus(vcpu_count)?; - - let device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, vm.fd())?; - - let vmm = Vmm { - events_observer: Some(std::io::stdin()), - instance_info: instance_info.clone(), - shutdown_exit_code: None, - kvm, - vm, - uffd: None, - vcpus_handles: Vec::new(), - vcpus_exit_evt, - device_manager, - }; - - Ok((vmm, vcpus)) -} - /// Builds and starts a microVM based on the current Firecracker VmResources configuration. /// /// The built microVM and all the created vCPUs start off in the paused state. @@ -167,8 +143,6 @@ pub fn build_microvm_for_boot( event_manager: &mut EventManager, seccomp_filters: &BpfThreadMap, ) -> Result>, StartMicrovmError> { - use self::StartMicrovmError::*; - // Timestamp for measuring microVM boot duration. 
let request_ts = TimestampUs::default(); @@ -176,7 +150,7 @@ pub fn build_microvm_for_boot( .boot_source .builder .as_ref() - .ok_or(MissingKernelConfig)?; + .ok_or(StartMicrovmError::MissingKernelConfig)?; let guest_memory = vm_resources .allocate_guest_memory() @@ -191,19 +165,17 @@ pub fn build_microvm_for_boot( .cpu_template .get_cpu_template()?; - let (mut vmm, mut vcpus) = create_vmm_and_vcpus( - instance_info, - event_manager, - vm_resources.machine_config.vcpu_count, - cpu_template.kvm_capabilities.clone(), - )?; + let kvm = Kvm::new(cpu_template.kvm_capabilities.clone())?; + // Set up Kvm Vm and register memory regions. + // Build custom CPU config if a custom template is provided. + let mut vm = Vm::new(&kvm)?; + let (mut vcpus, vcpus_exit_evt) = vm.create_vcpus(vm_resources.machine_config.vcpu_count)?; + vm.register_memory_regions(guest_memory)?; - vmm.vm - .register_memory_regions(guest_memory) - .map_err(VmmError::Vm)?; + let mut device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, vm.fd())?; - let entry_point = load_kernel(&boot_config.kernel_file, vmm.vm.guest_memory())?; - let initrd = InitrdConfig::from_config(boot_config, vmm.vm.guest_memory())?; + let entry_point = load_kernel(&boot_config.kernel_file, vm.guest_memory())?; + let initrd = InitrdConfig::from_config(boot_config, vm.guest_memory())?; #[cfg(feature = "gdb")] let (gdb_tx, gdb_rx) = mpsc::channel(); @@ -214,11 +186,11 @@ pub fn build_microvm_for_boot( #[cfg(feature = "gdb")] let vcpu_fds = vcpus .iter() - .map(|vcpu| vcpu.copy_kvm_vcpu_fd(vmm.vm())) + .map(|vcpu| vcpu.copy_kvm_vcpu_fd(&vm)) .collect::, _>>()?; if vm_resources.pci_enabled { - vmm.device_manager.enable_pci()?; + device_manager.enable_pci()?; } else { boot_cmdline.insert("pci", "off")?; } @@ -227,53 +199,70 @@ pub fn build_microvm_for_boot( // to maintain the same MMIO address referenced in the documentation // and tests. 
if vm_resources.boot_timer { - vmm.device_manager.attach_boot_timer_device(request_ts)?; + device_manager.attach_boot_timer_device(request_ts)?; } if let Some(balloon) = vm_resources.balloon.get() { - attach_balloon_device(&mut vmm, &mut boot_cmdline, balloon, event_manager)?; + attach_balloon_device( + &mut device_manager, + &vm, + &mut boot_cmdline, + balloon, + event_manager, + )?; } attach_block_devices( - &mut vmm, + &mut device_manager, + &vm, &mut boot_cmdline, vm_resources.block.devices.iter(), event_manager, )?; attach_net_devices( - &mut vmm, + &mut device_manager, + &vm, &mut boot_cmdline, vm_resources.net_builder.iter(), event_manager, )?; if let Some(unix_vsock) = vm_resources.vsock.get() { - attach_unixsock_vsock_device(&mut vmm, &mut boot_cmdline, unix_vsock, event_manager)?; + attach_unixsock_vsock_device( + &mut device_manager, + &vm, + &mut boot_cmdline, + unix_vsock, + event_manager, + )?; } if let Some(entropy) = vm_resources.entropy.get() { - attach_entropy_device(&mut vmm, &mut boot_cmdline, entropy, event_manager)?; + attach_entropy_device( + &mut device_manager, + &vm, + &mut boot_cmdline, + entropy, + event_manager, + )?; } #[cfg(target_arch = "aarch64")] - vmm.device_manager.attach_legacy_devices_aarch64( - vmm.vm.fd(), - event_manager, - &mut boot_cmdline, - )?; + device_manager.attach_legacy_devices_aarch64(vm.fd(), event_manager, &mut boot_cmdline)?; - vmm.device_manager - .attach_vmgenid_device(vmm.vm.guest_memory(), vmm.vm.fd())?; + device_manager.attach_vmgenid_device(vm.guest_memory(), vm.fd())?; #[cfg(target_arch = "aarch64")] if vcpus[0].kvm_vcpu.supports_pvtime() { - setup_pvtime(&mut vmm, &mut vcpus)?; + setup_pvtime(&mut device_manager, &mut vcpus)?; } else { log::warn!("Vcpus do not support pvtime, steal time will not be reported to guest"); } configure_system_for_boot( - &mut vmm, + &kvm, + &vm, + &mut device_manager, vcpus.as_mut(), &vm_resources.machine_config, &cpu_template, @@ -282,6 +271,18 @@ pub fn build_microvm_for_boot( boot_cmdline, )?; + let vmm = Vmm { + events_observer: Some(std::io::stdin()), + instance_info: instance_info.clone(), + shutdown_exit_code: None, + kvm, + vm: Arc::new(vm), + uffd: None, + vcpus_handles: Vec::new(), + vcpus_exit_evt, + device_manager, + }; + let vmm = Arc::new(Mutex::new(vmm)); #[cfg(feature = "gdb")] @@ -293,7 +294,7 @@ pub fn build_microvm_for_boot( entry_point.entry_addr, gdb_socket_path, ) - .map_err(GdbServer)?; + .map_err(StartMicrovmError::GdbServer)?; } else { debug!("No GDB socket provided not starting gdb server."); } @@ -305,7 +306,7 @@ pub fn build_microvm_for_boot( vcpus, seccomp_filters .get("vcpu") - .ok_or_else(|| MissingSeccompFilters("vcpu".to_string()))? + .ok_or_else(|| StartMicrovmError::MissingSeccompFilters("vcpu".to_string()))? .clone(), ) .map_err(VmmError::VcpuStart)?; @@ -317,7 +318,7 @@ pub fn build_microvm_for_boot( crate::seccomp::apply_filter( seccomp_filters .get("vmm") - .ok_or_else(|| MissingSeccompFilters("vmm".to_string()))?, + .ok_or_else(|| StartMicrovmError::MissingSeccompFilters("vmm".to_string()))?, ) .map_err(VmmError::SeccompFilters)?; @@ -402,19 +403,21 @@ pub fn build_microvm_from_snapshot( ) -> Result>, BuildMicrovmFromSnapshotError> { // Build Vmm. 
debug!("event_start: build microvm from snapshot"); - let (mut vmm, mut vcpus) = create_vmm_and_vcpus( - instance_info, - event_manager, - vm_resources.machine_config.vcpu_count, - microvm_state.kvm_state.kvm_cap_modifiers.clone(), - ) - .map_err(StartMicrovmError::Internal)?; - vmm.vm - .register_memory_regions(guest_memory) - .map_err(VmmError::Vm) - .map_err(StartMicrovmError::Internal)?; - vmm.uffd = uffd; + let kvm = Kvm::new(microvm_state.kvm_state.kvm_cap_modifiers.clone()) + .map_err(StartMicrovmError::Kvm)?; + // Set up Kvm Vm and register memory regions. + // Build custom CPU config if a custom template is provided. + let mut vm = Vm::new(&kvm).map_err(StartMicrovmError::Vm)?; + + let (mut vcpus, vcpus_exit_evt) = vm + .create_vcpus(vm_resources.machine_config.vcpu_count) + .map_err(StartMicrovmError::Vm)?; + + let mut device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, vm.fd()).unwrap(); + + vm.register_memory_regions(guest_memory) + .map_err(StartMicrovmError::Vm)?; #[cfg(target_arch = "x86_64")] { @@ -434,7 +437,7 @@ pub fn build_microvm_from_snapshot( #[cfg(target_arch = "aarch64")] if let Some(pvtime_ipa) = vcpus[0].kvm_vcpu.pvtime_ipa { allocate_pvtime_region( - &mut vmm, + &mut device_manager, vcpus.len(), vm_allocator::AllocPolicy::ExactMatch(pvtime_ipa.0), )?; @@ -452,28 +455,39 @@ pub fn build_microvm_from_snapshot( { let mpidrs = construct_kvm_mpidrs(µvm_state.vcpu_states); // Restore kvm vm state. - vmm.vm.restore_state(&mpidrs, µvm_state.vm_state)?; + vm.restore_state(&mpidrs, µvm_state.vm_state)?; } // Restore kvm vm state. #[cfg(target_arch = "x86_64")] - vmm.vm.restore_state(µvm_state.vm_state)?; + vm.restore_state(µvm_state.vm_state)?; // Restore the boot source config paths. vm_resources.boot_source.config = microvm_state.vm_info.boot_source; // Restore devices states. let device_ctor_args = DeviceRestoreArgs { - mem: vmm.vm.guest_memory(), - vm: vmm.vm.fd(), + mem: vm.guest_memory(), + vm: vm.fd(), event_manager, vm_resources, instance_id: &instance_info.id, - restored_from_file: vmm.uffd.is_none(), + restored_from_file: uffd.is_none(), }; - vmm.device_manager - .restore(µvm_state.device_states, device_ctor_args)?; + device_manager.restore(µvm_state.device_states, device_ctor_args)?; + + let mut vmm = Vmm { + events_observer: Some(std::io::stdin()), + instance_info: instance_info.clone(), + shutdown_exit_code: None, + kvm, + vm: Arc::new(vm), + uffd, + vcpus_handles: Vec::new(), + vcpus_exit_evt, + device_manager, + }; // Move vcpus to their own threads and start their state machine in the 'Paused' state. 
vmm.start_vcpus( @@ -506,13 +520,12 @@ const STEALTIME_STRUCT_MEM_SIZE: u64 = 64; /// Helper method to allocate steal time region #[cfg(target_arch = "aarch64")] fn allocate_pvtime_region( - vmm: &mut Vmm, + device_manager: &mut DeviceManager, vcpu_count: usize, policy: vm_allocator::AllocPolicy, ) -> Result { let size = STEALTIME_STRUCT_MEM_SIZE * vcpu_count as u64; - let addr = vmm - .device_manager + let addr = device_manager .resource_allocator .allocate_system_memory(size, STEALTIME_STRUCT_MEM_SIZE, policy) .map_err(StartMicrovmError::AllocateResources)?; @@ -521,10 +534,16 @@ fn allocate_pvtime_region( /// Sets up pvtime for all vcpus #[cfg(target_arch = "aarch64")] -fn setup_pvtime(vmm: &mut Vmm, vcpus: &mut [Vcpu]) -> Result<(), StartMicrovmError> { +fn setup_pvtime( + device_manager: &mut DeviceManager, + vcpus: &mut [Vcpu], +) -> Result<(), StartMicrovmError> { // Alloc sys mem for steal time region - let pvtime_mem: GuestAddress = - allocate_pvtime_region(vmm, vcpus.len(), vm_allocator::AllocPolicy::LastMatch)?; + let pvtime_mem: GuestAddress = allocate_pvtime_region( + device_manager, + vcpus.len(), + vm_allocator::AllocPolicy::LastMatch, + )?; // Register all vcpus with pvtime device for (i, vcpu) in vcpus.iter_mut().enumerate() { @@ -539,7 +558,8 @@ fn setup_pvtime(vmm: &mut Vmm, vcpus: &mut [Vcpu]) -> Result<(), StartMicrovmErr } fn attach_entropy_device( - vmm: &mut Vmm, + device_manager: &mut DeviceManager, + vm: &Vm, cmdline: &mut LoaderKernelCmdline, entropy_device: &Arc>, event_manager: &mut EventManager, @@ -551,9 +571,9 @@ fn attach_entropy_device( .to_string(); event_manager.add_subscriber(entropy_device.clone()); - vmm.device_manager.attach_virtio_device( - vmm.vm.guest_memory(), - vmm.vm.fd(), + device_manager.attach_virtio_device( + vm.guest_memory(), + vm.fd(), id, entropy_device.clone(), cmdline, @@ -562,7 +582,8 @@ fn attach_entropy_device( } fn attach_block_devices<'a, I: Iterator>> + Debug>( - vmm: &mut Vmm, + device_manager: &mut DeviceManager, + vm: &Vm, cmdline: &mut LoaderKernelCmdline, blocks: I, event_manager: &mut EventManager, @@ -584,9 +605,9 @@ fn attach_block_devices<'a, I: Iterator>> + Debug>( }; // The device mutex mustn't be locked here otherwise it will deadlock. event_manager.add_subscriber(block.clone()); - vmm.device_manager.attach_virtio_device( - vmm.vm.guest_memory(), - vmm.vm.fd(), + device_manager.attach_virtio_device( + vm.guest_memory(), + vm.fd(), id, block.clone(), cmdline, @@ -597,7 +618,8 @@ fn attach_block_devices<'a, I: Iterator>> + Debug>( } fn attach_net_devices<'a, I: Iterator>> + Debug>( - vmm: &mut Vmm, + device_manager: &mut DeviceManager, + vm: &Vm, cmdline: &mut LoaderKernelCmdline, net_devices: I, event_manager: &mut EventManager, @@ -606,9 +628,9 @@ fn attach_net_devices<'a, I: Iterator>> + Debug>( let id = net_device.lock().expect("Poisoned lock").id().clone(); event_manager.add_subscriber(net_device.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. 
- vmm.device_manager.attach_virtio_device( - vmm.vm.guest_memory(), - vmm.vm.fd(), + device_manager.attach_virtio_device( + vm.guest_memory(), + vm.fd(), id, net_device.clone(), cmdline, @@ -619,7 +641,8 @@ fn attach_net_devices<'a, I: Iterator>> + Debug>( } fn attach_unixsock_vsock_device( - vmm: &mut Vmm, + device_manager: &mut DeviceManager, + vm: &Vm, cmdline: &mut LoaderKernelCmdline, unix_vsock: &Arc>>, event_manager: &mut EventManager, @@ -627,9 +650,9 @@ fn attach_unixsock_vsock_device( let id = String::from(unix_vsock.lock().expect("Poisoned lock").id()); event_manager.add_subscriber(unix_vsock.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. - vmm.device_manager.attach_virtio_device( - vmm.vm.guest_memory(), - vmm.vm.fd(), + device_manager.attach_virtio_device( + vm.guest_memory(), + vm.fd(), id, unix_vsock.clone(), cmdline, @@ -638,7 +661,8 @@ fn attach_unixsock_vsock_device( } fn attach_balloon_device( - vmm: &mut Vmm, + device_manager: &mut DeviceManager, + vm: &Vm, cmdline: &mut LoaderKernelCmdline, balloon: &Arc>, event_manager: &mut EventManager, @@ -646,9 +670,9 @@ fn attach_balloon_device( let id = String::from(balloon.lock().expect("Poisoned lock").id()); event_manager.add_subscriber(balloon.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. - vmm.device_manager.attach_virtio_device( - vmm.vm.guest_memory(), - vmm.vm.fd(), + device_manager.attach_virtio_device( + vm.guest_memory(), + vm.fd(), id, balloon.clone(), cmdline, @@ -743,7 +767,7 @@ pub(crate) mod tests { instance_info: InstanceInfo::default(), shutdown_exit_code: None, kvm, - vm, + vm: Arc::new(vm), uffd: None, vcpus_handles: Vec::new(), vcpus_exit_evt, @@ -788,7 +812,8 @@ pub(crate) mod tests { } attach_block_devices( - vmm, + &mut vmm.device_manager, + &vmm.vm, cmdline, block_dev_configs.devices.iter(), event_manager, @@ -806,7 +831,13 @@ pub(crate) mod tests { let mut net_builder = NetBuilder::new(); net_builder.build(net_config).unwrap(); - let res = attach_net_devices(vmm, cmdline, net_builder.iter(), event_manager); + let res = attach_net_devices( + &mut vmm.device_manager, + &vmm.vm, + cmdline, + net_builder.iter(), + event_manager, + ); res.unwrap(); } @@ -827,7 +858,14 @@ pub(crate) mod tests { Arc::new(Mutex::new(mmds)), ); - attach_net_devices(vmm, cmdline, net_builder.iter(), event_manager).unwrap(); + attach_net_devices( + &mut vmm.device_manager, + &vmm.vm, + cmdline, + net_builder.iter(), + event_manager, + ) + .unwrap(); } pub(crate) fn insert_vsock_device( @@ -840,7 +878,14 @@ pub(crate) mod tests { let vsock = VsockBuilder::create_unixsock_vsock(vsock_config).unwrap(); let vsock = Arc::new(Mutex::new(vsock)); - attach_unixsock_vsock_device(vmm, cmdline, &vsock, event_manager).unwrap(); + attach_unixsock_vsock_device( + &mut vmm.device_manager, + &vmm.vm, + cmdline, + &vsock, + event_manager, + ) + .unwrap(); assert!( vmm.device_manager @@ -859,7 +904,14 @@ pub(crate) mod tests { let mut builder = EntropyDeviceBuilder::new(); let entropy = builder.build(entropy_config).unwrap(); - attach_entropy_device(vmm, cmdline, &entropy, event_manager).unwrap(); + attach_entropy_device( + &mut vmm.device_manager, + &vmm.vm, + cmdline, + &entropy, + event_manager, + ) + .unwrap(); assert!( vmm.device_manager @@ -887,7 +939,14 @@ pub(crate) mod tests { builder.set(balloon_config).unwrap(); let balloon = builder.get().unwrap(); - attach_balloon_device(vmm, cmdline, balloon, event_manager).unwrap(); + attach_balloon_device( + &mut 
vmm.device_manager, + &vmm.vm, + cmdline, + balloon, + event_manager, + ) + .unwrap(); assert!( vmm.device_manager diff --git a/src/vmm/src/device_manager/acpi.rs b/src/vmm/src/device_manager/acpi.rs index 78f1254d2fa..8a447c4c065 100644 --- a/src/vmm/src/device_manager/acpi.rs +++ b/src/vmm/src/device_manager/acpi.rs @@ -6,7 +6,7 @@ use kvm_ioctls::VmFd; use crate::devices::acpi::vmgenid::VmGenId; -#[derive(Debug)] +#[derive(Debug, Default)] pub struct ACPIDeviceManager { /// VMGenID device pub vmgenid: Option, @@ -15,7 +15,7 @@ pub struct ACPIDeviceManager { impl ACPIDeviceManager { /// Create a new ACPIDeviceManager object pub fn new() -> Self { - Self { vmgenid: None } + Default::default() } /// Attach a new VMGenID device to the microVM diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 1fd21195803..f2baaee30c4 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -127,7 +127,7 @@ pub struct MMIODevice { } /// Manages the complexities of registering a MMIO device. -#[derive(Debug)] +#[derive(Debug, Default)] pub struct MMIODeviceManager { /// VirtIO devices using an MMIO transport layer pub(crate) virtio_devices: HashMap<(u32, String), MMIODevice>, @@ -153,16 +153,7 @@ pub struct MMIODeviceManager { impl MMIODeviceManager { /// Create a new DeviceManager handling mmio devices (virtio net, block). pub fn new() -> MMIODeviceManager { - MMIODeviceManager { - virtio_devices: HashMap::new(), - boot_timer: None, - #[cfg(target_arch = "aarch64")] - rtc: None, - #[cfg(target_arch = "aarch64")] - serial: None, - #[cfg(target_arch = "x86_64")] - dsdt_data: vec![], - } + Default::default() } /// Allocates resources for a new device to be added. diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 2922060bb13..5457b22e39d 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -294,6 +294,18 @@ pub struct DeviceRestoreArgs<'a> { pub restored_from_file: bool, } +impl std::fmt::Debug for DeviceRestoreArgs<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("DeviceRestoreArgs") + .field("mem", &self.mem) + .field("vm", &self.vm) + .field("vm_resources", &self.vm_resources) + .field("instance_id", &self.instance_id) + .field("restored_from_file", &self.restored_from_file) + .finish() + } +} + impl DeviceManager { pub fn save(&self) -> DevicesState { DevicesState { diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index e0b1baf381e..6dfc91edf1f 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -221,6 +221,7 @@ pub struct ACPIDeviceManagerState { vmgenid: Option, } +#[derive(Debug)] pub struct ACPIDeviceManagerConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, pub resource_allocator: &'a ResourceAllocator, diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 8f19e780766..de99a4593af 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -299,8 +299,9 @@ pub struct Vmm { // Guest VM core resources. kvm: Kvm, /// VM object - pub vm: Vm, + pub vm: Arc, // Save UFFD in order to keep it open in the Firecracker process, as well. + #[allow(unused)] uffd: Option, vcpus_handles: Vec, // Used by Vcpus and devices to initiate teardown; Vmm should never write here. 
From f6b5782317c0c78ac6f98b7aede969424d283623 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 2 Jun 2025 19:15:23 +0200 Subject: [PATCH 35/99] vm: track device interrupts within Vm object Add logic to track the device interrupts used by the microVM. This is not strictly needed right now, but we will need it when adding support for MSI-X interrupts. MSI-X interrupts are configured at runtime and we need to interact with KVM to set the interruput routes. To do it, we need to keep track all of the interrupts the VM is using. Signed-off-by: Babis Chalios --- src/vmm/src/arch/aarch64/fdt.rs | 27 +++++------ src/vmm/src/builder.rs | 57 +++++----------------- src/vmm/src/device_manager/acpi.rs | 10 ++-- src/vmm/src/device_manager/legacy.rs | 15 +++--- src/vmm/src/device_manager/mmio.rs | 28 ++++++----- src/vmm/src/device_manager/mod.rs | 29 ++++++----- src/vmm/src/device_manager/persist.rs | 9 ++-- src/vmm/src/vstate/vm.rs | 70 ++++++++++++++++++++++++++- 8 files changed, 134 insertions(+), 111 deletions(-) diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index c642373a016..e22bda5583e 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -499,17 +499,16 @@ mod tests { use std::ffi::CString; use std::sync::{Arc, Mutex}; - use kvm_ioctls::Kvm; use linux_loader::cmdline as kernel_cmdline; use super::*; - use crate::EventManager; use crate::arch::aarch64::gic::create_gic; use crate::arch::aarch64::layout; use crate::device_manager::mmio::tests::DummyDevice; use crate::device_manager::tests::default_device_manager; use crate::test_utils::arch_mem; use crate::vstate::memory::GuestAddress; + use crate::{EventManager, Kvm, Vm}; // The `load` function from the `device_tree` will mistakenly check the actual size // of the buffer with the allocated size. This works around that. 
@@ -525,9 +524,9 @@ mod tests { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); let mut event_manager = EventManager::new().unwrap(); let mut device_manager = default_device_manager(); - let kvm = Kvm::new().unwrap(); - let vm = kvm.create_vm().unwrap(); - let gic = create_gic(&vm, 1, None).unwrap(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = Vm::new(&kvm).unwrap(); + let gic = create_gic(vm.fd(), 1, None).unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); cmdline.insert("console", "/dev/tty0").unwrap(); @@ -562,9 +561,9 @@ mod tests { fn test_create_fdt_with_vmgenid() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); let mut device_manager = default_device_manager(); - let kvm = Kvm::new().unwrap(); - let vm = kvm.create_vm().unwrap(); - let gic = create_gic(&vm, 1, None).unwrap(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = Vm::new(&kvm).unwrap(); + let gic = create_gic(vm.fd(), 1, None).unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); cmdline.insert("console", "/dev/tty0").unwrap(); @@ -585,9 +584,9 @@ mod tests { fn test_create_fdt() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); let device_manager = default_device_manager(); - let kvm = Kvm::new().unwrap(); - let vm = kvm.create_vm().unwrap(); - let gic = create_gic(&vm, 1, None).unwrap(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = Vm::new(&kvm).unwrap(); + let gic = create_gic(vm.fd(), 1, None).unwrap(); let saved_dtb_bytes = match gic.fdt_compatibility() { "arm,gic-v3" => include_bytes!("output_GICv3.dtb"), @@ -642,9 +641,9 @@ mod tests { fn test_create_fdt_with_initrd() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); let device_manager = default_device_manager(); - let kvm = Kvm::new().unwrap(); - let vm = kvm.create_vm().unwrap(); - let gic = create_gic(&vm, 1, None).unwrap(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = Vm::new(&kvm).unwrap(); + let gic = create_gic(vm.fd(), 1, None).unwrap(); let saved_dtb_bytes = match gic.fdt_compatibility() { "arm,gic-v3" => include_bytes!("output_initrd_GICv3.dtb"), diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 3f47b743063..9f4d82ed11a 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -172,7 +172,7 @@ pub fn build_microvm_for_boot( let (mut vcpus, vcpus_exit_evt) = vm.create_vcpus(vm_resources.machine_config.vcpu_count)?; vm.register_memory_regions(guest_memory)?; - let mut device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, vm.fd())?; + let mut device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, &vm)?; let entry_point = load_kernel(&boot_config.kernel_file, vm.guest_memory())?; let initrd = InitrdConfig::from_config(boot_config, vm.guest_memory())?; @@ -248,9 +248,9 @@ pub fn build_microvm_for_boot( } #[cfg(target_arch = "aarch64")] - device_manager.attach_legacy_devices_aarch64(vm.fd(), event_manager, &mut boot_cmdline)?; + device_manager.attach_legacy_devices_aarch64(&vm, event_manager, &mut boot_cmdline)?; - device_manager.attach_vmgenid_device(vm.guest_memory(), vm.fd())?; + device_manager.attach_vmgenid_device(vm.guest_memory(), &vm)?; #[cfg(target_arch = "aarch64")] if vcpus[0].kvm_vcpu.supports_pvtime() { @@ -414,7 +414,7 @@ pub fn build_microvm_from_snapshot( .create_vcpus(vm_resources.machine_config.vcpu_count) .map_err(StartMicrovmError::Vm)?; - let mut device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, vm.fd()).unwrap(); + let mut device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, 
&vm).unwrap(); vm.register_memory_regions(guest_memory) .map_err(StartMicrovmError::Vm)?; @@ -468,7 +468,7 @@ pub fn build_microvm_from_snapshot( // Restore devices states. let device_ctor_args = DeviceRestoreArgs { mem: vm.guest_memory(), - vm: vm.fd(), + vm: &vm, event_manager, vm_resources, instance_id: &instance_info.id, @@ -571,14 +571,7 @@ fn attach_entropy_device( .to_string(); event_manager.add_subscriber(entropy_device.clone()); - device_manager.attach_virtio_device( - vm.guest_memory(), - vm.fd(), - id, - entropy_device.clone(), - cmdline, - false, - ) + device_manager.attach_virtio_device(vm, id, entropy_device.clone(), cmdline, false) } fn attach_block_devices<'a, I: Iterator>> + Debug>( @@ -605,14 +598,7 @@ fn attach_block_devices<'a, I: Iterator>> + Debug>( }; // The device mutex mustn't be locked here otherwise it will deadlock. event_manager.add_subscriber(block.clone()); - device_manager.attach_virtio_device( - vm.guest_memory(), - vm.fd(), - id, - block.clone(), - cmdline, - is_vhost_user, - )?; + device_manager.attach_virtio_device(vm, id, block.clone(), cmdline, is_vhost_user)?; } Ok(()) } @@ -628,14 +614,7 @@ fn attach_net_devices<'a, I: Iterator>> + Debug>( let id = net_device.lock().expect("Poisoned lock").id().clone(); event_manager.add_subscriber(net_device.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. - device_manager.attach_virtio_device( - vm.guest_memory(), - vm.fd(), - id, - net_device.clone(), - cmdline, - false, - )?; + device_manager.attach_virtio_device(vm, id, net_device.clone(), cmdline, false)?; } Ok(()) } @@ -650,14 +629,7 @@ fn attach_unixsock_vsock_device( let id = String::from(unix_vsock.lock().expect("Poisoned lock").id()); event_manager.add_subscriber(unix_vsock.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. - device_manager.attach_virtio_device( - vm.guest_memory(), - vm.fd(), - id, - unix_vsock.clone(), - cmdline, - false, - ) + device_manager.attach_virtio_device(vm, id, unix_vsock.clone(), cmdline, false) } fn attach_balloon_device( @@ -670,14 +642,7 @@ fn attach_balloon_device( let id = String::from(balloon.lock().expect("Poisoned lock").id()); event_manager.add_subscriber(balloon.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. 
- device_manager.attach_virtio_device( - vm.guest_memory(), - vm.fd(), - id, - balloon.clone(), - cmdline, - false, - ) + device_manager.attach_virtio_device(vm, id, balloon.clone(), cmdline, false) } #[cfg(test)] @@ -924,7 +889,7 @@ pub(crate) mod tests { #[cfg(target_arch = "x86_64")] pub(crate) fn insert_vmgenid_device(vmm: &mut Vmm) { vmm.device_manager - .attach_vmgenid_device(vmm.vm.guest_memory(), vmm.vm.fd()) + .attach_vmgenid_device(vmm.vm.guest_memory(), &vmm.vm) .unwrap(); assert!(vmm.device_manager.acpi_devices.vmgenid.is_some()); } diff --git a/src/vmm/src/device_manager/acpi.rs b/src/vmm/src/device_manager/acpi.rs index 8a447c4c065..3f0af80c7aa 100644 --- a/src/vmm/src/device_manager/acpi.rs +++ b/src/vmm/src/device_manager/acpi.rs @@ -2,8 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 use acpi_tables::{Aml, aml}; -use kvm_ioctls::VmFd; +use crate::Vm; use crate::devices::acpi::vmgenid::VmGenId; #[derive(Debug, Default)] @@ -21,12 +21,8 @@ impl ACPIDeviceManager { /// Attach a new VMGenID device to the microVM /// /// This will register the device's interrupt with KVM - pub fn attach_vmgenid( - &mut self, - vmgenid: VmGenId, - vm_fd: &VmFd, - ) -> Result<(), kvm_ioctls::Error> { - vm_fd.register_irqfd(&vmgenid.interrupt_evt, vmgenid.gsi)?; + pub fn attach_vmgenid(&mut self, vmgenid: VmGenId, vm: &Vm) -> Result<(), kvm_ioctls::Error> { + vm.register_irq(&vmgenid.interrupt_evt, vmgenid.gsi)?; self.vmgenid = Some(vmgenid); Ok(()) } diff --git a/src/vmm/src/device_manager/legacy.rs b/src/vmm/src/device_manager/legacy.rs index cedb7abc32c..7011ae71122 100644 --- a/src/vmm/src/device_manager/legacy.rs +++ b/src/vmm/src/device_manager/legacy.rs @@ -11,11 +11,11 @@ use std::sync::{Arc, Mutex}; use acpi_tables::aml::AmlError; use acpi_tables::{Aml, aml}; -use kvm_ioctls::VmFd; use libc::EFD_NONBLOCK; use vm_superio::Serial; use vmm_sys_util::eventfd::EventFd; +use crate::Vm; use crate::devices::legacy::serial::SerialOut; use crate::devices::legacy::{EventFdTrigger, I8042Device, SerialDevice, SerialEventsWrapper}; @@ -100,7 +100,7 @@ impl PortIODeviceManager { pub fn register_devices( &mut self, io_bus: &vm_device::Bus, - vm_fd: &VmFd, + vm: &Vm, ) -> Result<(), LegacyDeviceError> { let serial_2_4 = Arc::new(Mutex::new(SerialDevice { serial: Serial::with_events( @@ -148,18 +148,15 @@ impl PortIODeviceManager { Self::I8042_KDB_DATA_REGISTER_SIZE, )?; - vm_fd - .register_irqfd(&self.com_evt_1_3, Self::COM_EVT_1_3_GSI) + vm.register_irq(&self.com_evt_1_3, Self::COM_EVT_1_3_GSI) .map_err(|e| { LegacyDeviceError::EventFd(std::io::Error::from_raw_os_error(e.errno())) })?; - vm_fd - .register_irqfd(&self.com_evt_2_4, Self::COM_EVT_2_4_GSI) + vm.register_irq(&self.com_evt_2_4, Self::COM_EVT_2_4_GSI) .map_err(|e| { LegacyDeviceError::EventFd(std::io::Error::from_raw_os_error(e.errno())) })?; - vm_fd - .register_irqfd(&self.kbd_evt, Self::KBD_EVT_GSI) + vm.register_irq(&self.kbd_evt, Self::KBD_EVT_GSI) .map_err(|e| { LegacyDeviceError::EventFd(std::io::Error::from_raw_os_error(e.errno())) })?; @@ -264,6 +261,6 @@ mod tests { )), ) .unwrap(); - ldm.register_devices(&io_bus, vm.fd()).unwrap(); + ldm.register_devices(&io_bus, &vm).unwrap(); } } diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index f2baaee30c4..8655247fde7 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -11,7 +11,7 @@ use std::sync::{Arc, Mutex}; #[cfg(target_arch = "x86_64")] use acpi_tables::{Aml, aml}; -use kvm_ioctls::{IoEventAddress, VmFd}; +use 
kvm_ioctls::IoEventAddress; use linux_loader::cmdline as kernel_cmdline; #[cfg(target_arch = "x86_64")] use log::debug; @@ -20,6 +20,7 @@ use serde::{Deserialize, Serialize}; use vm_allocator::AllocPolicy; use super::resources::ResourceAllocator; +use crate::Vm; use crate::arch::BOOT_DEVICE_MEM_START; #[cfg(target_arch = "aarch64")] use crate::arch::{RTC_MEM_START, SERIAL_MEM_START}; @@ -183,7 +184,7 @@ impl MMIODeviceManager { /// Register a virtio-over-MMIO device to be used via MMIO transport at a specific slot. pub fn register_mmio_virtio( &mut self, - vm: &VmFd, + vm: &Vm, device_id: String, mmio_bus: &vm_device::Bus, device: MMIODevice, @@ -200,10 +201,11 @@ impl MMIODeviceManager { let io_addr = IoEventAddress::Mmio( device.resources.addr + u64::from(crate::devices::virtio::NOTIFY_REG_OFFSET), ); - vm.register_ioevent(queue_evt, &io_addr, u32::try_from(i).unwrap()) + vm.fd() + .register_ioevent(queue_evt, &io_addr, u32::try_from(i).unwrap()) .map_err(MmioError::RegisterIoEvent)?; } - vm.register_irqfd(&mmio_device.interrupt.irq_evt, irq) + vm.register_irq(&mmio_device.interrupt.irq_evt, irq) .map_err(MmioError::RegisterIrqFd)?; } @@ -242,7 +244,7 @@ impl MMIODeviceManager { /// to the boot cmdline. pub fn register_mmio_virtio_for_boot( &mut self, - vm: &VmFd, + vm: &Vm, resource_allocator: &ResourceAllocator, device_id: String, mmio_device: MmioTransport, @@ -274,7 +276,7 @@ impl MMIODeviceManager { /// otherwise allocate a new MMIO resources for it. pub fn register_mmio_serial( &mut self, - vm: &VmFd, + vm: &Vm, resource_allocator: &ResourceAllocator, serial: Arc>, device_info_opt: Option, @@ -292,7 +294,7 @@ impl MMIODeviceManager { } }; - vm.register_irqfd( + vm.register_irq( serial.lock().expect("Poisoned lock").serial.interrupt_evt(), device_info.irq.unwrap(), ) @@ -556,7 +558,7 @@ pub(crate) mod tests { impl MMIODeviceManager { pub(crate) fn register_virtio_test_device( &mut self, - vm: &VmFd, + vm: &Vm, guest_mem: GuestMemoryMmap, resource_allocator: &ResourceAllocator, device: Arc>, @@ -689,7 +691,7 @@ pub(crate) mod tests { device_manager .register_virtio_test_device( - vm.fd(), + &vm, vm.guest_memory().clone(), &resource_allocator, dummy, @@ -737,7 +739,7 @@ pub(crate) mod tests { for _i in crate::arch::GSI_BASE..=crate::arch::GSI_MAX { device_manager .register_virtio_test_device( - vm.fd(), + &vm, vm.guest_memory().clone(), &resource_allocator, Arc::new(Mutex::new(DummyDevice::new())), @@ -751,7 +753,7 @@ pub(crate) mod tests { "{}", device_manager .register_virtio_test_device( - vm.fd(), + &vm, vm.guest_memory().clone(), &resource_allocator, Arc::new(Mutex::new(DummyDevice::new())), @@ -796,7 +798,7 @@ pub(crate) mod tests { let id = String::from("foo"); let addr = device_manager .register_virtio_test_device( - vm.fd(), + &vm, vm.guest_memory().clone(), &resource_allocator, dummy, @@ -826,7 +828,7 @@ pub(crate) mod tests { let id2 = String::from("foo2"); device_manager .register_virtio_test_device( - vm.fd(), + &vm, vm.guest_memory().clone(), &resource_allocator, dummy2, diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 5457b22e39d..a60a86ea7c3 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -10,7 +10,6 @@ use std::sync::{Arc, Mutex}; use acpi::ACPIDeviceManager; use event_manager::{MutEventSubscriber, SubscriberOps}; -use kvm_ioctls::VmFd; #[cfg(target_arch = "x86_64")] use legacy::{LegacyDeviceError, PortIODeviceManager}; use linux_loader::loader::Cmdline; @@ -36,7 +35,7 @@ use 
crate::devices::virtio::transport::mmio::{IrqTrigger, MmioTransport}; use crate::resources::VmResources; use crate::snapshot::Persist; use crate::vstate::memory::GuestMemoryMmap; -use crate::{EmulateSerialInitError, EventManager}; +use crate::{EmulateSerialInitError, EventManager, Vm}; /// ACPI device manager. pub mod acpi; @@ -143,7 +142,7 @@ impl DeviceManager { pub fn new( event_manager: &mut EventManager, vcpu_exit_evt: &EventFd, - vmfd: &VmFd, + vm: &Vm, ) -> Result { let resource_allocator = Arc::new(ResourceAllocator::new()?); #[cfg(target_arch = "x86_64")] @@ -160,7 +159,7 @@ impl DeviceManager { // create pio dev manager with legacy devices let mut legacy_devices = PortIODeviceManager::new(serial, i8042)?; - legacy_devices.register_devices(&resource_allocator.pio_bus, vmfd)?; + legacy_devices.register_devices(&resource_allocator.pio_bus, vm)?; legacy_devices }; @@ -177,8 +176,7 @@ impl DeviceManager { /// Attaches a VirtioDevice device to the device manager and event manager. pub(crate) fn attach_virtio_device( &mut self, - mem: &GuestMemoryMmap, - vmfd: &VmFd, + vm: &Vm, id: String, device: Arc>, cmdline: &mut Cmdline, @@ -186,9 +184,10 @@ impl DeviceManager { ) -> Result<(), AttachMmioDeviceError> { let interrupt = Arc::new(IrqTrigger::new()); // The device mutex mustn't be locked here otherwise it will deadlock. - let device = MmioTransport::new(mem.clone(), interrupt, device, is_vhost_user); + let device = + MmioTransport::new(vm.guest_memory().clone(), interrupt, device, is_vhost_user); self.mmio_devices.register_mmio_virtio_for_boot( - vmfd, + vm, &self.resource_allocator, id, device, @@ -214,17 +213,17 @@ impl DeviceManager { pub(crate) fn attach_vmgenid_device( &mut self, mem: &GuestMemoryMmap, - vmfd: &VmFd, + vm: &Vm, ) -> Result<(), AttachVmgenidError> { let vmgenid = VmGenId::new(mem, &self.resource_allocator)?; - self.acpi_devices.attach_vmgenid(vmgenid, vmfd)?; + self.acpi_devices.attach_vmgenid(vmgenid, vm)?; Ok(()) } #[cfg(target_arch = "aarch64")] pub(crate) fn attach_legacy_devices_aarch64( &mut self, - vmfd: &VmFd, + vm: &Vm, event_manager: &mut EventManager, cmdline: &mut Cmdline, ) -> Result<(), AttachLegacyMmioDeviceError> { @@ -241,7 +240,7 @@ impl DeviceManager { Self::set_stdout_nonblocking(); let serial = Self::setup_serial_device(event_manager)?; self.mmio_devices - .register_mmio_serial(vmfd, &self.resource_allocator, serial, None)?; + .register_mmio_serial(vm, &self.resource_allocator, serial, None)?; self.mmio_devices.add_mmio_serial_to_cmdline(cmdline)?; } @@ -287,7 +286,7 @@ pub enum DevicePersistError { pub struct DeviceRestoreArgs<'a> { pub mem: &'a GuestMemoryMmap, - pub vm: &'a VmFd, + pub vm: &'a Vm, pub event_manager: &'a mut EventManager, pub vm_resources: &'a mut VmResources, pub instance_id: &'a str, @@ -434,7 +433,7 @@ pub(crate) mod tests { let mut cmdline = Cmdline::new(4096).unwrap(); let mut event_manager = EventManager::new().unwrap(); vmm.device_manager - .attach_legacy_devices_aarch64(vmm.vm.fd(), &mut event_manager, &mut cmdline) + .attach_legacy_devices_aarch64(&vmm.vm, &mut event_manager, &mut cmdline) .unwrap(); assert!(vmm.device_manager.mmio_devices.rtc.is_some()); assert!(vmm.device_manager.mmio_devices.serial.is_none()); @@ -442,7 +441,7 @@ pub(crate) mod tests { let mut vmm = default_vmm(); cmdline.insert("console", "/dev/blah").unwrap(); vmm.device_manager - .attach_legacy_devices_aarch64(vmm.vm.fd(), &mut event_manager, &mut cmdline) + .attach_legacy_devices_aarch64(&vmm.vm, &mut event_manager, &mut cmdline) .unwrap(); 
assert!(vmm.device_manager.mmio_devices.rtc.is_some()); assert!(vmm.device_manager.mmio_devices.serial.is_some()); diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 6dfc91edf1f..7b9605b3e5d 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -7,7 +7,6 @@ use std::fmt::{self, Debug}; use std::sync::{Arc, Mutex}; use event_manager::{MutEventSubscriber, SubscriberOps}; -use kvm_ioctls::VmFd; use log::{error, warn}; use serde::{Deserialize, Serialize}; use vm_allocator::AllocPolicy; @@ -15,7 +14,6 @@ use vm_allocator::AllocPolicy; use super::acpi::ACPIDeviceManager; use super::mmio::*; use super::resources::ResourceAllocator; -use crate::EventManager; #[cfg(target_arch = "aarch64")] use crate::arch::DeviceType; use crate::devices::acpi::vmgenid::{VMGenIDState, VMGenIdConstructorArgs, VmGenId, VmGenIdError}; @@ -51,6 +49,7 @@ use crate::resources::{ResourcesError, VmResources}; use crate::snapshot::Persist; use crate::vmm_config::mmds::MmdsConfigError; use crate::vstate::memory::GuestMemoryMmap; +use crate::{EventManager, Vm}; /// Errors for (de)serialization of the MMIO device manager. #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -196,7 +195,7 @@ pub enum SharedDeviceType { pub struct MMIODevManagerConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, - pub vm: &'a VmFd, + pub vm: &'a Vm, pub event_manager: &'a mut EventManager, pub resource_allocator: &'a ResourceAllocator, pub vm_resources: &'a mut VmResources, @@ -225,7 +224,7 @@ pub struct ACPIDeviceManagerState { pub struct ACPIDeviceManagerConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, pub resource_allocator: &'a ResourceAllocator, - pub vm: &'a VmFd, + pub vm: &'a Vm, } #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -792,7 +791,7 @@ mod tests { let vm_resources = &mut VmResources::default(); let restore_args = MMIODevManagerConstructorArgs { mem: vmm.vm.guest_memory(), - vm: vmm.vm.fd(), + vm: &vmm.vm, event_manager: &mut event_manager, resource_allocator: &resource_allocator, vm_resources, diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index f5a5755eec9..07f0a1e787c 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -5,13 +5,20 @@ // Use of this source code is governed by a BSD-style license that can be // found in the THIRD-PARTY file. 
+use std::collections::HashMap; use std::fs::OpenOptions; use std::io::Write; use std::path::Path; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; -use kvm_bindings::{KVM_MEM_LOG_DIRTY_PAGES, kvm_userspace_memory_region}; +#[cfg(target_arch = "x86_64")] +use kvm_bindings::KVM_IRQCHIP_IOAPIC; +use kvm_bindings::{ + KVM_IRQ_ROUTING_IRQCHIP, KVM_MEM_LOG_DIRTY_PAGES, kvm_irq_routing_entry, + kvm_userspace_memory_region, +}; use kvm_ioctls::VmFd; +use vmm_sys_util::errno; use vmm_sys_util::eventfd::EventFd; use crate::arch::host_page_size; @@ -26,6 +33,28 @@ use crate::vstate::memory::{ use crate::vstate::vcpu::VcpuError; use crate::{DirtyBitmap, Vcpu, mem_size_mib}; +#[derive(Debug, thiserror::Error, displaydoc::Display)] +/// Errors related with Firecracker interrupts +pub enum InterruptError { + /// Error allocating resources: {0} + Allocator(#[from] vm_allocator::Error), + /// EventFd error: {0} + EventFd(std::io::Error), + /// FamStruct error: {0} + FamStruct(#[from] vmm_sys_util::fam::Error), + /// KVM error: {0} + Kvm(#[from] kvm_ioctls::Error), +} + +// TODO: temporarily allow these to be unused, they will be used in the next commit +#[allow(dead_code)] +#[derive(Debug)] +/// A struct representing an interrupt line used by some device of the microVM +pub struct RoutingEntry { + entry: kvm_irq_routing_entry, + masked: bool, +} + /// Architecture independent parts of a VM. #[derive(Debug)] pub struct VmCommon { @@ -34,6 +63,8 @@ pub struct VmCommon { max_memslots: u32, /// The guest memory of this Vm. pub guest_memory: GuestMemoryMmap, + /// Interrupts used by Vm's devices + pub interrupts: Mutex>, } /// Errors associated with the wrappers over KVM ioctls. @@ -105,6 +136,7 @@ impl Vm { fd, max_memslots: kvm.max_nr_memslots(), guest_memory: GuestMemoryMmap::default(), + interrupts: Mutex::new(HashMap::new()), }) } @@ -284,6 +316,40 @@ impl Vm { file.sync_all() .map_err(|err| MemoryBackingFile("sync_all", err)) } + + /// Register a device IRQ + pub fn register_irq(&self, fd: &EventFd, gsi: u32) -> Result<(), errno::Error> { + self.common.fd.register_irqfd(fd, gsi)?; + + let mut entry = kvm_irq_routing_entry { + gsi, + type_: KVM_IRQ_ROUTING_IRQCHIP, + ..Default::default() + }; + #[cfg(target_arch = "x86_64")] + { + entry.u.irqchip.irqchip = KVM_IRQCHIP_IOAPIC; + } + #[cfg(target_arch = "aarch64")] + { + entry.u.irqchip.irqchip = 0; + } + + entry.u.irqchip.pin = gsi; + + self.common + .interrupts + .lock() + .expect("Poisoned lock") + .insert( + gsi, + RoutingEntry { + entry, + masked: false, + }, + ); + Ok(()) + } } /// Use `mincore(2)` to overapproximate the dirty bitmap for the given memslot. To be used From d9a458f78a68eb011480bf1c78b51d1e976f85a3 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 3 Jun 2025 12:28:39 +0200 Subject: [PATCH 36/99] interrupts: add support for MSI/MSI-X interrupts Enable Vm to vend and manage MSI/MSI-X interrupts. This adds the logic to create a set of MSI vectors and then handle their lifetime. 
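The intended usage, condensed from the unit tests added in this patch (the
real consumer will be the virtio-pci transport later in the series), is to
create one vector group per device and drive it through the
InterruptSourceGroup trait:

    // Allocate GSI-backed vectors, e.g. one per queue plus one for config.
    let group = Vm::create_msix_group(vm.clone(), &resource_allocator, 4)?;
    // Program a vector from the guest-written MSI-X table entry and push the
    // updated routing table to KVM.
    group.update(0, InterruptSourceConfig::MsiIrq(cfg), false, true)?;
    group.enable()?;   // registers the per-vector irqfds with KVM
    group.trigger(0)?; // injects the interrupt by writing the vector's EventFd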
Signed-off-by: Babis Chalios --- src/vmm/src/vstate/vm.rs | 444 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 439 insertions(+), 5 deletions(-) diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 07f0a1e787c..eaf8b3f9a51 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -9,20 +9,26 @@ use std::collections::HashMap; use std::fs::OpenOptions; use std::io::Write; use std::path::Path; +use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Mutex}; #[cfg(target_arch = "x86_64")] use kvm_bindings::KVM_IRQCHIP_IOAPIC; use kvm_bindings::{ - KVM_IRQ_ROUTING_IRQCHIP, KVM_MEM_LOG_DIRTY_PAGES, kvm_irq_routing_entry, - kvm_userspace_memory_region, + KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI, KVM_MEM_LOG_DIRTY_PAGES, KVM_MSI_VALID_DEVID, + KvmIrqRouting, kvm_irq_routing_entry, kvm_userspace_memory_region, }; use kvm_ioctls::VmFd; +use log::debug; +use vm_device::interrupt::{ + InterruptIndex, InterruptSourceConfig, InterruptSourceGroup, MsiIrqSourceConfig, +}; use vmm_sys_util::errno; use vmm_sys_util::eventfd::EventFd; use crate::arch::host_page_size; pub use crate::arch::{ArchVm as Vm, ArchVmError, VmState}; +use crate::device_manager::resources::ResourceAllocator; use crate::logger::info; use crate::persist::CreateSnapshotError; use crate::utils::u64_to_usize; @@ -46,8 +52,6 @@ pub enum InterruptError { Kvm(#[from] kvm_ioctls::Error), } -// TODO: temporarily allow these to be unused, they will be used in the next commit -#[allow(dead_code)] #[derive(Debug)] /// A struct representing an interrupt line used by some device of the microVM pub struct RoutingEntry { @@ -55,6 +59,148 @@ pub struct RoutingEntry { masked: bool, } +/// Type that describes an allocated interrupt +#[derive(Debug)] +pub struct MsiVector { + /// GSI used for this vector + pub gsi: u32, + /// EventFd used for this vector + pub event_fd: EventFd, + /// Flag determining whether the vector is enabled + pub enabled: AtomicBool, +} + +impl MsiVector { + /// Create a new [`MsiVector`] of a particular type + pub fn new(gsi: u32, enabled: bool) -> Result { + Ok(MsiVector { + gsi, + event_fd: EventFd::new(libc::EFD_NONBLOCK).map_err(InterruptError::EventFd)?, + enabled: AtomicBool::new(enabled), + }) + } +} + +impl MsiVector { + /// Enable vector + fn enable(&self, vmfd: &VmFd) -> Result<(), errno::Error> { + if !self.enabled.load(Ordering::Acquire) { + vmfd.register_irqfd(&self.event_fd, self.gsi)?; + self.enabled.store(true, Ordering::Release); + } + + Ok(()) + } + + /// Disable vector + fn disable(&self, vmfd: &VmFd) -> Result<(), errno::Error> { + if self.enabled.load(Ordering::Acquire) { + vmfd.unregister_irqfd(&self.event_fd, self.gsi)?; + self.enabled.store(false, Ordering::Release); + } + + Ok(()) + } +} + +#[derive(Debug)] +/// MSI interrupts created for a VirtIO device +pub struct MsiVectorGroup { + vm: Arc, + irq_routes: HashMap, +} + +impl MsiVectorGroup { + /// Returns the number of vectors in this group + pub fn num_vectors(&self) -> u16 { + // It is safe to unwrap here. We are creating `MsiVectorGroup` objects through the + // `Vm::create_msix_group` where the argument for the number of `irq_routes` is a `u16`. 
+ u16::try_from(self.irq_routes.len()).unwrap() + } +} + +impl InterruptSourceGroup for MsiVectorGroup { + fn enable(&self) -> vm_device::interrupt::Result<()> { + for route in self.irq_routes.values() { + route.enable(&self.vm.common.fd)?; + } + + Ok(()) + } + + fn disable(&self) -> vm_device::interrupt::Result<()> { + for route in self.irq_routes.values() { + route.disable(&self.vm.common.fd)?; + } + + Ok(()) + } + + fn trigger(&self, index: InterruptIndex) -> vm_device::interrupt::Result<()> { + self.notifier(index) + .ok_or(std::io::Error::other(format!( + "trigger: invalid interrupt index {index}" + )))? + .write(1) + } + + fn notifier(&self, index: InterruptIndex) -> Option<&EventFd> { + self.irq_routes.get(&index).map(|route| &route.event_fd) + } + + fn update( + &self, + index: InterruptIndex, + config: InterruptSourceConfig, + masked: bool, + set_gsi: bool, + ) -> vm_device::interrupt::Result<()> { + let msi_config = match config { + InterruptSourceConfig::LegacyIrq(_) => { + return Err(std::io::Error::other( + "MSI-x update: invalid configuration type", + )); + } + InterruptSourceConfig::MsiIrq(config) => config, + }; + + if let Some(route) = self.irq_routes.get(&index) { + // When an interrupt is masked the GSI will not be passed to KVM through + // KVM_SET_GSI_ROUTING. So, call [`disable()`] to unregister the interrupt file + // descriptor before passing the interrupt routes to KVM + if masked { + route.disable(&self.vm.common.fd)?; + } + + self.vm.register_msi(route, masked, msi_config)?; + if set_gsi { + self.vm + .set_gsi_routes() + .map_err(|err| std::io::Error::other(format!("MSI-X update: {err}")))? + } + + // Assign KVM_IRQFD after KVM_SET_GSI_ROUTING to avoid + // panic on kernel which does not have commit a80ced6ea514 + // (KVM: SVM: fix panic on out-of-bounds guest IRQ). + if !masked { + route.enable(&self.vm.common.fd)?; + } + + return Ok(()); + } + + Err(std::io::Error::other(format!( + "MSI-X update: invalid vector index {index}" + ))) + } + + fn set_gsi(&self) -> vm_device::interrupt::Result<()> { + self.vm + .set_gsi_routes() + .map_err(|err| std::io::Error::other(format!("MSI-X update: {err}"))) + } +} + /// Architecture independent parts of a VM. #[derive(Debug)] pub struct VmCommon { @@ -334,7 +480,6 @@ impl Vm { { entry.u.irqchip.irqchip = 0; } - entry.u.irqchip.pin = gsi; self.common @@ -350,6 +495,84 @@ impl Vm { ); Ok(()) } + + /// Register an MSI device interrupt + pub fn register_msi( + &self, + route: &MsiVector, + masked: bool, + config: MsiIrqSourceConfig, + ) -> Result<(), errno::Error> { + let mut entry = kvm_irq_routing_entry { + gsi: route.gsi, + type_: KVM_IRQ_ROUTING_MSI, + ..Default::default() + }; + entry.u.msi.address_lo = config.low_addr; + entry.u.msi.address_hi = config.high_addr; + entry.u.msi.data = config.data; + + if self.common.fd.check_extension(kvm_ioctls::Cap::MsiDevid) { + // According to KVM documentation: + // https://docs.kernel.org/virt/kvm/api.html#kvm-set-gsi-routing + // + // if the capability is set, we need to set the flag and provide a valid unique device + // ID. "For PCI, this is usually a BDF identifier in the lower 16 bits". + // + // The layout of `config.devid` is: + // + // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --| + // | segment | bus | device | function | + // + // For the time being, we are using a single PCI segment and a single bus per segment + // so just passing config.devid should be fine. 
+ entry.flags = KVM_MSI_VALID_DEVID; + entry.u.msi.__bindgen_anon_1.devid = config.devid; + } + + self.common + .interrupts + .lock() + .expect("Poisoned lock") + .insert(route.gsi, RoutingEntry { entry, masked }); + + Ok(()) + } + + /// Create a group of MSI-X interrupts + pub fn create_msix_group( + vm: Arc, + resource_allocator: &ResourceAllocator, + count: u16, + ) -> Result { + debug!("Creating new MSI group with {count} vectors"); + let mut irq_routes = HashMap::with_capacity(count as usize); + for (gsi, i) in resource_allocator + .allocate_gsi(count as u32)? + .iter() + .zip(0u32..) + { + irq_routes.insert(i, MsiVector::new(*gsi, false)?); + } + + Ok(MsiVectorGroup { vm, irq_routes }) + } + + /// Set GSI routes to KVM + pub fn set_gsi_routes(&self) -> Result<(), InterruptError> { + let entries = self.common.interrupts.lock().expect("Poisoned lock"); + let mut routes = KvmIrqRouting::new(0)?; + + for entry in entries.values() { + if entry.masked { + continue; + } + routes.push(entry.entry)?; + } + + self.common.fd.set_gsi_routing(&routes)?; + Ok(()) + } } /// Use `mincore(2)` to overapproximate the dirty bitmap for the given memslot. To be used @@ -395,6 +618,7 @@ fn mincore_bitmap(region: &GuestRegionMmap) -> Result, VmError> { #[cfg(test)] pub(crate) mod tests { + use vm_device::interrupt::{InterruptSourceConfig, LegacyIrqSourceConfig}; use vm_memory::GuestAddress; use vm_memory::mmap::MmapRegionBuilder; @@ -505,4 +729,214 @@ pub(crate) mod tests { assert_eq!(vcpu_vec.len(), vcpu_count as usize); } + + fn enable_irqchip(vm: &mut Vm) { + #[cfg(target_arch = "x86_64")] + vm.setup_irqchip().unwrap(); + #[cfg(target_arch = "aarch64")] + vm.setup_irqchip(1).unwrap(); + } + + fn create_msix_group(vm: &Arc) -> MsiVectorGroup { + let resource_allocator = ResourceAllocator::new().unwrap(); + Vm::create_msix_group(vm.clone(), &resource_allocator, 4).unwrap() + } + + #[test] + fn test_msi_vector_group_new() { + let (_, vm) = setup_vm_with_memory(mib_to_bytes(128)); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + assert_eq!(msix_group.num_vectors(), 4); + } + + #[test] + fn test_msi_vector_group_enable_disable() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + + // Initially all vectors are disabled + for route in msix_group.irq_routes.values() { + assert!(!route.enabled.load(Ordering::Acquire)) + } + + // Enable works + msix_group.enable().unwrap(); + for route in msix_group.irq_routes.values() { + assert!(route.enabled.load(Ordering::Acquire)); + } + // Enabling an enabled group doesn't error out + msix_group.enable().unwrap(); + + // Disable works + msix_group.disable().unwrap(); + for route in msix_group.irq_routes.values() { + assert!(!route.enabled.load(Ordering::Acquire)) + } + // Disabling a disabled group doesn't error out + } + + #[test] + fn test_msi_vector_group_trigger() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + + // We can now trigger all vectors + for i in 0..4 { + msix_group.trigger(i).unwrap() + } + + // We can't trigger an invalid vector + msix_group.trigger(4).unwrap_err(); + } + + #[test] + fn test_msi_vector_group_notifier() { + let (_, vm) = setup_vm_with_memory(mib_to_bytes(128)); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + + for i in 0..4 { + assert!(msix_group.notifier(i).is_some()); + } + + 
assert!(msix_group.notifier(4).is_none()); + } + + #[test] + fn test_msi_vector_group_update_wrong_config() { + let (_, vm) = setup_vm_with_memory(mib_to_bytes(128)); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + let irq_config = LegacyIrqSourceConfig { irqchip: 0, pin: 0 }; + msix_group + .update(0, InterruptSourceConfig::LegacyIrq(irq_config), true, true) + .unwrap_err(); + } + + #[test] + fn test_msi_vector_group_update_invalid_vector() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + let config = InterruptSourceConfig::MsiIrq(MsiIrqSourceConfig { + high_addr: 0x42, + low_addr: 0x12, + data: 0x12, + devid: 0xafa, + }); + msix_group.update(0, config, true, true).unwrap(); + msix_group.update(4, config, true, true).unwrap_err(); + } + + #[test] + fn test_msi_vector_group_update() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + let vm = Arc::new(vm); + assert!(vm.common.interrupts.lock().unwrap().is_empty()); + let msix_group = create_msix_group(&vm); + + // Set some configuration for the vectors. Initially all are masked + let mut config = MsiIrqSourceConfig { + high_addr: 0x42, + low_addr: 0x13, + data: 0x12, + devid: 0xafa, + }; + for i in 0..4 { + config.data = 0x12 * i; + msix_group + .update(i, InterruptSourceConfig::MsiIrq(config), true, false) + .unwrap(); + } + + // All vectors should be disabled + for vector in msix_group.irq_routes.values() { + assert!(!vector.enabled.load(Ordering::Acquire)); + } + + for i in 0..4 { + let gsi = crate::arch::GSI_BASE + i; + let interrupts = vm.common.interrupts.lock().unwrap(); + let kvm_route = interrupts.get(&gsi).unwrap(); + assert!(kvm_route.masked); + assert_eq!(kvm_route.entry.gsi, gsi); + assert_eq!(kvm_route.entry.type_, KVM_IRQ_ROUTING_MSI); + // SAFETY: because we know we setup MSI routes. + unsafe { + assert_eq!(kvm_route.entry.u.msi.address_hi, 0x42); + assert_eq!(kvm_route.entry.u.msi.address_lo, 0x13); + assert_eq!(kvm_route.entry.u.msi.data, 0x12 * i); + } + } + + // Simply enabling the vectors should not update the registered IRQ routes + msix_group.enable().unwrap(); + for i in 0..4 { + let gsi = crate::arch::GSI_BASE + i; + let interrupts = vm.common.interrupts.lock().unwrap(); + let kvm_route = interrupts.get(&gsi).unwrap(); + assert!(kvm_route.masked); + assert_eq!(kvm_route.entry.gsi, gsi); + assert_eq!(kvm_route.entry.type_, KVM_IRQ_ROUTING_MSI); + // SAFETY: because we know we setup MSI routes. + unsafe { + assert_eq!(kvm_route.entry.u.msi.address_hi, 0x42); + assert_eq!(kvm_route.entry.u.msi.address_lo, 0x13); + assert_eq!(kvm_route.entry.u.msi.data, 0x12 * i); + } + } + + // Updating the config of a vector should enable its route (and only its route) + config.data = 0; + msix_group + .update(0, InterruptSourceConfig::MsiIrq(config), false, true) + .unwrap(); + for i in 0..4 { + let gsi = crate::arch::GSI_BASE + i; + let interrupts = vm.common.interrupts.lock().unwrap(); + let kvm_route = interrupts.get(&gsi).unwrap(); + assert_eq!(kvm_route.masked, i != 0); + assert_eq!(kvm_route.entry.gsi, gsi); + assert_eq!(kvm_route.entry.type_, KVM_IRQ_ROUTING_MSI); + // SAFETY: because we know we setup MSI routes. 
+ unsafe { + assert_eq!(kvm_route.entry.u.msi.address_hi, 0x42); + assert_eq!(kvm_route.entry.u.msi.address_lo, 0x13); + assert_eq!(kvm_route.entry.u.msi.data, 0x12 * i); + } + } + } + + #[cfg(target_arch = "x86_64")] + #[test] + fn test_msi_vector_group_set_gsi_without_ioapic() { + // Setting GSI routes without IOAPIC setup should fail on x86. Apparently, it doesn't fail + // on Aarch64 + let (_, vm) = setup_vm_with_memory(mib_to_bytes(128)); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + let err = msix_group.set_gsi().unwrap_err(); + assert_eq!( + format!("{err}"), + "MSI-X update: KVM error: Invalid argument (os error 22)" + ); + } + + #[test] + fn test_msi_vector_group_set_gsi() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + + msix_group.set_gsi().unwrap(); + } } From 3341ac3322db86a061467ea2bb77e34e8eb03475 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 10 Jun 2025 18:46:23 +0200 Subject: [PATCH 37/99] vstate: support serializing interrupts to snapshots Vm object is now maintaining information about the interrupts (both traditional IRQs and MSI-X vectors) that are being used by microVM devices. Derive Serialize/Deserialize add logic for recreating objects for relevant types. Signed-off-by: Babis Chalios --- src/vmm/src/vstate/vm.rs | 58 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index eaf8b3f9a51..6d5a86f76ed 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -20,6 +20,7 @@ use kvm_bindings::{ }; use kvm_ioctls::VmFd; use log::debug; +use serde::{Deserialize, Serialize}; use vm_device::interrupt::{ InterruptIndex, InterruptSourceConfig, InterruptSourceGroup, MsiIrqSourceConfig, }; @@ -31,6 +32,7 @@ pub use crate::arch::{ArchVm as Vm, ArchVmError, VmState}; use crate::device_manager::resources::ResourceAllocator; use crate::logger::info; use crate::persist::CreateSnapshotError; +use crate::snapshot::Persist; use crate::utils::u64_to_usize; use crate::vmm_config::snapshot::SnapshotType; use crate::vstate::memory::{ @@ -52,7 +54,7 @@ pub enum InterruptError { Kvm(#[from] kvm_ioctls::Error), } -#[derive(Debug)] +#[derive(Debug, Serialize, Deserialize)] /// A struct representing an interrupt line used by some device of the microVM pub struct RoutingEntry { entry: kvm_irq_routing_entry, @@ -119,6 +121,38 @@ impl MsiVectorGroup { } } +impl<'a> Persist<'a> for MsiVectorGroup { + type State = HashMap; + type ConstructorArgs = Arc; + type Error = InterruptError; + + fn save(&self) -> Self::State { + // We don't save the "enabled" state of the MSI interrupt. 
PCI devices store the MSI-X + // configuration and make sure that the vector is enabled during the restore path if it was + // initially enabled + self.irq_routes + .iter() + .map(|(id, route)| (*id, route.gsi)) + .collect() + } + + fn restore( + constructor_args: Self::ConstructorArgs, + state: &Self::State, + ) -> std::result::Result { + let mut irq_routes = HashMap::new(); + + for (id, gsi) in state { + irq_routes.insert(*id, MsiVector::new(*gsi, false)?); + } + + Ok(MsiVectorGroup { + vm: constructor_args, + irq_routes, + }) + } +} + impl InterruptSourceGroup for MsiVectorGroup { fn enable(&self) -> vm_device::interrupt::Result<()> { for route in self.irq_routes.values() { @@ -939,4 +973,26 @@ pub(crate) mod tests { msix_group.set_gsi().unwrap(); } + + #[test] + fn test_msi_vector_group_persistence() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + + msix_group.enable().unwrap(); + let state = msix_group.save(); + let restored_group = MsiVectorGroup::restore(vm, &state).unwrap(); + + assert_eq!(msix_group.num_vectors(), restored_group.num_vectors()); + // Even if an MSI group is enabled, we don't save it as such. During restoration, the PCI + // transport will make sure the correct config is set for the vectors and enable them + // accordingly. + for (id, vector) in msix_group.irq_routes { + let new_vector = restored_group.irq_routes.get(&id).unwrap(); + assert_eq!(vector.gsi, new_vector.gsi); + assert!(!new_vector.enabled.load(Ordering::Acquire)); + } + } } From 509f40acf3b27e2d8bae9950c5198ec91f9825af Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 3 Jun 2025 18:10:50 +0200 Subject: [PATCH 38/99] virtio: initialize queue size with max_size Apparently, PCI needs Queue::size to be initialized to the maximum possible size supported by the device, otherwise initialization fails. Signed-off-by: Babis Chalios --- src/vmm/src/devices/virtio/queue.rs | 2 +- src/vmm/src/devices/virtio/transport/mmio.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vmm/src/devices/virtio/queue.rs b/src/vmm/src/devices/virtio/queue.rs index 9977070293e..79c635e5c4d 100644 --- a/src/vmm/src/devices/virtio/queue.rs +++ b/src/vmm/src/devices/virtio/queue.rs @@ -280,7 +280,7 @@ impl Queue { pub fn new(max_size: u16) -> Queue { Queue { max_size, - size: 0, + size: max_size, ready: false, desc_table_address: GuestAddress(0), avail_ring_address: GuestAddress(0), diff --git a/src/vmm/src/devices/virtio/transport/mmio.rs b/src/vmm/src/devices/virtio/transport/mmio.rs index 5ecc3fa8ffe..3a8aa1ad42e 100644 --- a/src/vmm/src/devices/virtio/transport/mmio.rs +++ b/src/vmm/src/devices/virtio/transport/mmio.rs @@ -787,7 +787,7 @@ pub(crate) mod tests { assert_eq!(d.queue_select, 3); d.queue_select = 0; - assert_eq!(d.locked_device().queues()[0].size, 0); + assert_eq!(d.locked_device().queues()[0].size, 16); write_le_u32(&mut buf[..], 16); d.write(0x0, 0x38, &buf[..]); assert_eq!(d.locked_device().queues()[0].size, 16); From 8369c7ad691488b4921e4f0a312a7937ca05f810 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 3 Jun 2025 18:13:08 +0200 Subject: [PATCH 39/99] acpi: PCI compatible flags in FADT Remove the flags in FADT that were declaring we do not support MSI and PCI ASPM. 
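With MSI-X capable PCI devices on the way, the guest OS must be allowed to
enable MSIs (and ASPM) for its PCI devices, so keep only the VGA-not-present
flag.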
Signed-off-by: Babis Chalios --- src/vmm/src/acpi/x86_64.rs | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/vmm/src/acpi/x86_64.rs b/src/vmm/src/acpi/x86_64.rs index de850a9989f..53eeac7b5e2 100644 --- a/src/vmm/src/acpi/x86_64.rs +++ b/src/vmm/src/acpi/x86_64.rs @@ -3,10 +3,7 @@ use std::mem::size_of; -use acpi_tables::fadt::{ - IAPC_BOOT_ARG_FLAGS_MSI_NOT_PRESENT, IAPC_BOOT_ARG_FLAGS_PCI_ASPM, - IAPC_BOOT_ARG_FLAGS_VGA_NOT_PRESENT, -}; +use acpi_tables::fadt::IAPC_BOOT_ARG_FLAGS_VGA_NOT_PRESENT; use acpi_tables::madt::{IoAPIC, LocalAPIC}; use acpi_tables::{Fadt, aml}; use vm_memory::GuestAddress; @@ -33,11 +30,7 @@ pub(crate) fn setup_arch_fadt(fadt: &mut Fadt) { // neither do we support ASPM, or MSI type of interrupts. // More info here: // https://uefi.org/specs/ACPI/6.5/05_ACPI_Software_Programming_Model.html?highlight=0a06#ia-pc-boot-architecture-flags - fadt.setup_iapc_flags( - (1 << IAPC_BOOT_ARG_FLAGS_VGA_NOT_PRESENT) - | (1 << IAPC_BOOT_ARG_FLAGS_PCI_ASPM) - | (1 << IAPC_BOOT_ARG_FLAGS_MSI_NOT_PRESENT), - ); + fadt.setup_iapc_flags(1 << IAPC_BOOT_ARG_FLAGS_VGA_NOT_PRESENT); } #[inline(always)] From 45ad79b4a9cda4abca3a65f4da05622bb620035e Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 3 Jun 2025 14:05:42 +0200 Subject: [PATCH 40/99] vmm: simplify device errors Merge the device-related errors that DeviceManager might return. This way, we can avoid adding yet another error type for PCI devices and reduce some the variants of StartMicrovmError. Suggested-by: Egor Lazarchuk Signed-off-by: Babis Chalios --- src/vmm/src/builder.rs | 25 ++++++++++--------------- src/vmm/src/device_manager/mod.rs | 31 +++++++++---------------------- 2 files changed, 19 insertions(+), 37 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 9f4d82ed11a..a10f91a7fc5 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -22,12 +22,12 @@ use crate::arch::{ConfigurationError, configure_system_for_boot, load_kernel}; #[cfg(target_arch = "aarch64")] use crate::construct_kvm_mpidrs; use crate::cpu_config::templates::{GetCpuTemplate, GetCpuTemplateError, GuestConfigError}; -#[cfg(target_arch = "aarch64")] -use crate::device_manager::AttachLegacyMmioDeviceError; +#[cfg(target_arch = "x86_64")] +use crate::device_manager; use crate::device_manager::pci_mngr::PciManagerError; use crate::device_manager::{ - AttachMmioDeviceError, AttachVmgenidError, DeviceManager, DeviceManagerCreateError, - DevicePersistError, DeviceRestoreArgs, + AttachDeviceError, DeviceManager, DeviceManagerCreateError, DevicePersistError, + DeviceRestoreArgs, }; use crate::devices::acpi::vmgenid::VmGenIdError; use crate::devices::virtio::balloon::Balloon; @@ -48,18 +48,15 @@ use crate::vstate::kvm::{Kvm, KvmError}; use crate::vstate::memory::GuestRegionMmap; use crate::vstate::vcpu::VcpuError; use crate::vstate::vm::{Vm, VmError}; -use crate::{EventManager, Vmm, VmmError, device_manager}; +use crate::{EventManager, Vmm, VmmError}; /// Errors associated with starting the instance. 
#[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum StartMicrovmError { /// Unable to attach block device to Vmm: {0} AttachBlockDevice(io::Error), - /// Unable to attach the VMGenID device: {0} - AttachVmgenidDevice(#[from] AttachVmgenidError), - #[cfg(target_arch = "aarch64")] - /// Unable to attach legacy MMIO devices: {0} - AttachLegacyDevices(#[from] AttachLegacyMmioDeviceError), + /// Could not attach device: {0} + AttachDevice(#[from] AttachDeviceError), /// System configuration error: {0} ConfigureSystem(#[from] ConfigurationError), /// Failed to create device manager: {0} @@ -104,8 +101,6 @@ pub enum StartMicrovmError { NetDeviceNotConfigured, /// Cannot open the block device backing file: {0} OpenBlockDevice(io::Error), - /// Cannot initialize a MMIO Device or add a device to the MMIO Bus or cmdline: {0} - RegisterMmioDevice(#[from] device_manager::AttachMmioDeviceError), /// Cannot restore microvm state: {0} RestoreMicrovmState(MicrovmStateError), /// Cannot set vm resources: {0} @@ -563,7 +558,7 @@ fn attach_entropy_device( cmdline: &mut LoaderKernelCmdline, entropy_device: &Arc>, event_manager: &mut EventManager, -) -> Result<(), AttachMmioDeviceError> { +) -> Result<(), AttachDeviceError> { let id = entropy_device .lock() .expect("Poisoned lock") @@ -625,7 +620,7 @@ fn attach_unixsock_vsock_device( cmdline: &mut LoaderKernelCmdline, unix_vsock: &Arc>>, event_manager: &mut EventManager, -) -> Result<(), AttachMmioDeviceError> { +) -> Result<(), AttachDeviceError> { let id = String::from(unix_vsock.lock().expect("Poisoned lock").id()); event_manager.add_subscriber(unix_vsock.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. @@ -638,7 +633,7 @@ fn attach_balloon_device( cmdline: &mut LoaderKernelCmdline, balloon: &Arc>, event_manager: &mut EventManager, -) -> Result<(), AttachMmioDeviceError> { +) -> Result<(), AttachDeviceError> { let id = String::from(balloon.lock().expect("Poisoned lock").id()); event_manager.add_subscriber(balloon.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. 
diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index a60a86ea7c3..8df4da2863d 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -64,34 +64,21 @@ pub enum DeviceManagerCreateError { #[derive(Debug, thiserror::Error, displaydoc::Display)] /// Error while attaching a VirtIO device -pub enum AttachMmioDeviceError { +pub enum AttachDeviceError { /// MMIO transport error: {0} MmioTransport(#[from] MmioError), /// Error inserting device in bus: {0} Bus(#[from] vm_device::BusError), -} - -#[derive(Debug, thiserror::Error, displaydoc::Display)] -/// Error while attaching the VMGenID device -pub enum AttachVmgenidError { /// Error creating VMGenID device: {0} CreateVmGenID(#[from] VmGenIdError), /// Error while registering VMGenID with KVM: {0} AttachVmGenID(#[from] kvm_ioctls::Error), -} - -#[cfg(target_arch = "aarch64")] -#[derive(Debug, thiserror::Error, displaydoc::Display)] -/// Error while attaching the VMGenID device -pub enum AttachLegacyMmioDeviceError { + #[cfg(target_arch = "aarch64")] /// Cmdline error Cmdline, + #[cfg(target_arch = "aarch64")] /// Error creating serial device: {0} CreateSerial(#[from] std::io::Error), - /// Error registering device: {0} - RegisterMMIODevice(#[from] MmioError), - /// Error inserting device in the Bus: {0} - Bus(#[from] vm_device::BusError), } #[derive(Debug)] @@ -181,7 +168,7 @@ impl DeviceManager { device: Arc>, cmdline: &mut Cmdline, is_vhost_user: bool, - ) -> Result<(), AttachMmioDeviceError> { + ) -> Result<(), AttachDeviceError> { let interrupt = Arc::new(IrqTrigger::new()); // The device mutex mustn't be locked here otherwise it will deadlock. let device = @@ -201,7 +188,7 @@ impl DeviceManager { pub(crate) fn attach_boot_timer_device( &mut self, request_ts: TimestampUs, - ) -> Result<(), AttachMmioDeviceError> { + ) -> Result<(), AttachDeviceError> { let boot_timer = Arc::new(Mutex::new(BootTimer::new(request_ts))); self.mmio_devices @@ -214,7 +201,7 @@ impl DeviceManager { &mut self, mem: &GuestMemoryMmap, vm: &Vm, - ) -> Result<(), AttachVmgenidError> { + ) -> Result<(), AttachDeviceError> { let vmgenid = VmGenId::new(mem, &self.resource_allocator)?; self.acpi_devices.attach_vmgenid(vmgenid, vm)?; Ok(()) @@ -226,13 +213,13 @@ impl DeviceManager { vm: &Vm, event_manager: &mut EventManager, cmdline: &mut Cmdline, - ) -> Result<(), AttachLegacyMmioDeviceError> { + ) -> Result<(), AttachDeviceError> { // Serial device setup. let cmdline_contains_console = cmdline .as_cstring() - .map_err(|_| AttachLegacyMmioDeviceError::Cmdline)? + .map_err(|_| AttachDeviceError::Cmdline)? .into_string() - .map_err(|_| AttachLegacyMmioDeviceError::Cmdline)? + .map_err(|_| AttachDeviceError::Cmdline)? .contains("console="); if cmdline_contains_console { From d8986a9199cfcb491d75a06d85fd13cf4c3ef79a Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 2 Jun 2025 20:48:16 +0200 Subject: [PATCH 41/99] pci: add virtio-pci transport implementation Add a VirtIO PCI transport implementation. When a Firecracker microVM is launched with --enable-pci, we will create all VirtIO devices using the PCI transport layer. Snapshotting of VirtIO PCI devices is not supported and we will add this functionality in later commit. Add a couple of tests that ensure that PCI configuration space is what expected. We read common fields and make sure the BAR we allocate for the VirtIO device is what expected. 
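In rough strokes, attaching a device over the PCI transport (condensed from
attach_pci_virtio_device, with local names abbreviated) looks like this:

    let bdf = pci_segment.next_device_bdf()?;            // next free slot on the segment
    // One MSI-X vector per queue plus one for configuration changes.
    let msix = Vm::create_msix_group(vm.clone(), &allocator, num_queues + 1)?;
    let mut dev = VirtioPciDevice::new(id, mem, device, Arc::new(msix), bdf.into())?;
    dev.allocate_bars(&mut mmio32, &mut mmio64, None)?;  // BARs backing the virtio-pci capabilities
    let dev = Arc::new(Mutex::new(dev));
    pci_bus.add_device(bdf.device() as u32, dev.clone())?; // make config space visible to the guest

after which the BARs are inserted on the MMIO (and, on x86_64, port I/O)
buses and the queue notification ioeventfds are registered with KVM.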
Signed-off-by: Babis Chalios --- Cargo.lock | 8 + src/vmm/Cargo.toml | 2 + src/vmm/src/builder.rs | 14 +- src/vmm/src/device_manager/mod.rs | 27 +- src/vmm/src/device_manager/pci_mngr.rs | 131 +- src/vmm/src/devices/virtio/device.rs | 2 +- src/vmm/src/devices/virtio/queue.rs | 13 + src/vmm/src/devices/virtio/transport/mod.rs | 2 + .../virtio/transport/pci/common_config.rs | 415 ++++++ .../devices/virtio/transport/pci/device.rs | 1279 +++++++++++++++++ .../src/devices/virtio/transport/pci/mod.rs | 5 + 11 files changed, 1887 insertions(+), 11 deletions(-) create mode 100644 src/vmm/src/devices/virtio/transport/pci/common_config.rs create mode 100644 src/vmm/src/devices/virtio/transport/pci/device.rs create mode 100644 src/vmm/src/devices/virtio/transport/pci/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 8f20f8f2d50..c22fd8d9d04 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -112,6 +112,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "anyhow" +version = "1.0.98" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" + [[package]] name = "arrayvec" version = "0.7.6" @@ -1652,11 +1658,13 @@ version = "0.1.0" dependencies = [ "acpi_tables", "aes-gcm", + "anyhow", "arrayvec", "aws-lc-rs", "base64", "bincode", "bitflags 2.9.1", + "byteorder", "crc64", "criterion", "derive_more", diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index 6bfd64853b5..eeb71fd7d32 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -17,11 +17,13 @@ gdb = ["arrayvec", "gdbstub", "gdbstub_arch"] acpi_tables = { path = "../acpi-tables" } aes-gcm = { version = "0.10.1", default-features = false, features = ["aes"] } +anyhow = "1.0.98" arrayvec = { version = "0.7.6", optional = true } aws-lc-rs = { version = "1.13.3", features = ["bindgen"] } base64 = "0.22.1" bincode = { version = "2.0.1", features = ["serde"] } bitflags = "2.9.1" +byteorder = "1.5.0" crc64 = "2.0.0" derive_more = { version = "2.0.1", default-features = false, features = [ "from", diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index a10f91a7fc5..b9b60ea8895 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -169,6 +169,8 @@ pub fn build_microvm_for_boot( let mut device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, &vm)?; + let vm = Arc::new(vm); + let entry_point = load_kernel(&boot_config.kernel_file, vm.guest_memory())?; let initrd = InitrdConfig::from_config(boot_config, vm.guest_memory())?; @@ -271,7 +273,7 @@ pub fn build_microvm_for_boot( instance_info: instance_info.clone(), shutdown_exit_code: None, kvm, - vm: Arc::new(vm), + vm, uffd: None, vcpus_handles: Vec::new(), vcpus_exit_evt, @@ -554,7 +556,7 @@ fn setup_pvtime( fn attach_entropy_device( device_manager: &mut DeviceManager, - vm: &Vm, + vm: &Arc, cmdline: &mut LoaderKernelCmdline, entropy_device: &Arc>, event_manager: &mut EventManager, @@ -571,7 +573,7 @@ fn attach_entropy_device( fn attach_block_devices<'a, I: Iterator>> + Debug>( device_manager: &mut DeviceManager, - vm: &Vm, + vm: &Arc, cmdline: &mut LoaderKernelCmdline, blocks: I, event_manager: &mut EventManager, @@ -600,7 +602,7 @@ fn attach_block_devices<'a, I: Iterator>> + Debug>( fn attach_net_devices<'a, I: Iterator>> + Debug>( device_manager: &mut DeviceManager, - vm: &Vm, + vm: &Arc, cmdline: &mut LoaderKernelCmdline, net_devices: I, event_manager: &mut EventManager, @@ -616,7 +618,7 @@ fn attach_net_devices<'a, I: Iterator>> + Debug>( fn 
attach_unixsock_vsock_device( device_manager: &mut DeviceManager, - vm: &Vm, + vm: &Arc, cmdline: &mut LoaderKernelCmdline, unix_vsock: &Arc>>, event_manager: &mut EventManager, @@ -629,7 +631,7 @@ fn attach_unixsock_vsock_device( fn attach_balloon_device( device_manager: &mut DeviceManager, - vm: &Vm, + vm: &Arc, cmdline: &mut LoaderKernelCmdline, balloon: &Arc>, event_manager: &mut EventManager, diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 8df4da2863d..da61db922c3 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -79,6 +79,8 @@ pub enum AttachDeviceError { #[cfg(target_arch = "aarch64")] /// Error creating serial device: {0} CreateSerial(#[from] std::io::Error), + /// Error attach PCI device: {0} + PciTransport(#[from] PciManagerError), } #[derive(Debug)] @@ -160,8 +162,10 @@ impl DeviceManager { }) } - /// Attaches a VirtioDevice device to the device manager and event manager. - pub(crate) fn attach_virtio_device( + /// Attaches an MMIO VirtioDevice device to the device manager and event manager. + pub(crate) fn attach_mmio_virtio_device< + T: 'static + VirtioDevice + MutEventSubscriber + Debug, + >( &mut self, vm: &Vm, id: String, @@ -184,6 +188,25 @@ impl DeviceManager { Ok(()) } + /// Attaches a VirtioDevice device to the device manager and event manager. + pub(crate) fn attach_virtio_device( + &mut self, + vm: &Arc, + id: String, + device: Arc>, + cmdline: &mut Cmdline, + is_vhost_user: bool, + ) -> Result<(), AttachDeviceError> { + if self.pci_devices.pci_segment.is_some() { + self.pci_devices + .attach_pci_virtio_device(vm, &self.resource_allocator, id, device)?; + } else { + self.attach_mmio_virtio_device(vm, id, device, cmdline, is_vhost_user)?; + } + + Ok(()) + } + /// Attaches a [`BootTimer`] to the VM pub(crate) fn attach_boot_timer_device( &mut self, diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index e9ada60cc1f..686349858fb 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -1,18 +1,29 @@ // Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 -use std::sync::Arc; +use std::collections::HashMap; +use std::fmt::Debug; +use std::sync::{Arc, Mutex}; +use event_manager::MutEventSubscriber; +use log::debug; +use pci::{PciBarRegionType, PciDevice, PciDeviceError, PciRootError}; use serde::{Deserialize, Serialize}; use vm_device::BusError; -use super::resources::ResourceAllocator; +use crate::Vm; +use crate::device_manager::resources::ResourceAllocator; use crate::devices::pci::PciSegment; +use crate::devices::virtio::device::VirtioDevice; +use crate::devices::virtio::transport::pci::device::{VirtioPciDevice, VirtioPciDeviceError}; +use crate::vstate::vm::InterruptError; #[derive(Debug, Default)] pub struct PciDevices { /// PCIe segment of the VMM, if PCI is enabled. We currently support a single PCIe segment. 
pub pci_segment: Option, + /// All VirtIO PCI devices of the system + pub virtio_devices: HashMap<(u32, String), Arc>>, } #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -21,6 +32,16 @@ pub enum PciManagerError { ResourceAllocation(#[from] vm_allocator::Error), /// Bus error: {0} Bus(#[from] BusError), + /// PCI root error: {0} + PciRoot(#[from] PciRootError), + /// MSI error: {0} + Msi(#[from] InterruptError), + /// VirtIO PCI device error: {0} + VirtioPciDevice(#[from] VirtioPciDeviceError), + /// PCI device error: {0} + PciDeviceError(#[from] PciDeviceError), + /// KVM error: {0} + Kvm(#[from] vmm_sys_util::errno::Error), } impl PciDevices { @@ -61,6 +82,112 @@ impl PciDevices { Ok(()) } + + fn register_bars_with_bus( + resource_allocator: &ResourceAllocator, + virtio_device: &Arc>, + ) -> Result<(), PciManagerError> { + for bar in &virtio_device.lock().expect("Poisoned lock").bar_regions { + match bar.region_type() { + PciBarRegionType::IoRegion => { + debug!( + "Inserting I/O BAR region: {:#x}:{:#x}", + bar.addr(), + bar.size() + ); + #[cfg(target_arch = "x86_64")] + resource_allocator.pio_bus.insert( + virtio_device.clone(), + bar.addr(), + bar.size(), + )?; + #[cfg(target_arch = "aarch64")] + log::error!("pci: We do not support I/O region allocation") + } + PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => { + debug!( + "Inserting MMIO BAR region: {:#x}:{:#x}", + bar.addr(), + bar.size() + ); + resource_allocator.mmio_bus.insert( + virtio_device.clone(), + bar.addr(), + bar.size(), + )?; + } + } + } + + Ok(()) + } + + pub(crate) fn attach_pci_virtio_device< + T: 'static + VirtioDevice + MutEventSubscriber + Debug, + >( + &mut self, + vm: &Arc, + resource_allocator: &ResourceAllocator, + id: String, + device: Arc>, + ) -> Result<(), PciManagerError> { + // We should only be reaching this point if PCI is enabled + let pci_segment = self.pci_segment.as_ref().unwrap(); + let pci_device_bdf = pci_segment.next_device_bdf()?; + debug!("Allocating BDF: {pci_device_bdf:?} for device"); + let mem = vm.guest_memory().clone(); + + // Allocate one MSI vector per queue, plus one for configuration + let msix_num = + u16::try_from(device.lock().expect("Poisoned lock").queues().len() + 1).unwrap(); + + let msix_vectors = Arc::new(Vm::create_msix_group( + vm.clone(), + resource_allocator, + msix_num, + )?); + + // Create the transport + let mut virtio_device = + VirtioPciDevice::new(id.clone(), mem, device, msix_vectors, pci_device_bdf.into())?; + + // Allocate bars + let mut mmio32_allocator = resource_allocator + .mmio32_memory + .lock() + .expect("Poisoned lock"); + let mut mmio64_allocator = resource_allocator + .mmio64_memory + .lock() + .expect("Poisoned lock"); + + virtio_device.allocate_bars(&mut mmio32_allocator, &mut mmio64_allocator, None)?; + + let virtio_device = Arc::new(Mutex::new(virtio_device)); + pci_segment + .pci_bus + .lock() + .expect("Poisoned lock") + .add_device(pci_device_bdf.device() as u32, virtio_device.clone())?; + + Self::register_bars_with_bus(resource_allocator, &virtio_device)?; + virtio_device + .lock() + .expect("Poisoned lock") + .register_notification_ioevent(vm)?; + + Ok(()) + } + + /// Gets the specified device. 
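+    /// Devices are looked up by `(virtio device type, device id)`, the same key type
+    /// used by the `virtio_devices` map above.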
+ pub fn get_virtio_device( + &self, + device_type: u32, + device_id: &str, + ) -> Option<&Arc>> { + self.virtio_devices + .get(&(device_type, device_id.to_string())) + } } #[derive(Default, Debug, Clone, Serialize, Deserialize)] diff --git a/src/vmm/src/devices/virtio/device.rs b/src/vmm/src/devices/virtio/device.rs index 0b09195d8f7..7b51a4b1dbf 100644 --- a/src/vmm/src/devices/virtio/device.rs +++ b/src/vmm/src/devices/virtio/device.rs @@ -148,7 +148,7 @@ pub trait VirtioDevice: AsAny + Send { /// Optionally deactivates this device and returns ownership of the guest memory map, interrupt /// event, and queue events. - fn reset(&mut self) -> Option<(EventFd, Vec)> { + fn reset(&mut self) -> Option<(Arc, Vec)> { None } diff --git a/src/vmm/src/devices/virtio/queue.rs b/src/vmm/src/devices/virtio/queue.rs index 79c635e5c4d..7fd862f45ca 100644 --- a/src/vmm/src/devices/virtio/queue.rs +++ b/src/vmm/src/devices/virtio/queue.rs @@ -669,6 +669,19 @@ impl Queue { new - used_event - Wrapping(1) < new - old } + + /// Resets the Virtio Queue + pub(crate) fn reset(&mut self) { + self.ready = false; + self.size = self.max_size; + self.desc_table_address = GuestAddress(0); + self.avail_ring_address = GuestAddress(0); + self.used_ring_address = GuestAddress(0); + self.next_avail = Wrapping(0); + self.next_used = Wrapping(0); + self.num_added = Wrapping(0); + self.uses_notif_suppression = false; + } } #[cfg(kani)] diff --git a/src/vmm/src/devices/virtio/transport/mod.rs b/src/vmm/src/devices/virtio/transport/mod.rs index d41ad943aa2..c16a7adbe9d 100644 --- a/src/vmm/src/devices/virtio/transport/mod.rs +++ b/src/vmm/src/devices/virtio/transport/mod.rs @@ -8,6 +8,8 @@ use vmm_sys_util::eventfd::EventFd; /// MMIO transport for VirtIO devices pub mod mmio; +/// PCI transport for VirtIO devices +pub mod pci; /// Represents the types of interrupts used by VirtIO devices #[derive(Debug, Clone)] diff --git a/src/vmm/src/devices/virtio/transport/pci/common_config.rs b/src/vmm/src/devices/virtio/transport/pci/common_config.rs new file mode 100644 index 00000000000..c8ee2d1d2a9 --- /dev/null +++ b/src/vmm/src/devices/virtio/transport/pci/common_config.rs @@ -0,0 +1,415 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::sync::atomic::{AtomicU16, Ordering}; +use std::sync::{Arc, Mutex}; + +use byteorder::{ByteOrder, LittleEndian}; +use serde::{Deserialize, Serialize}; +use vm_memory::GuestAddress; + +use crate::devices::virtio::device::VirtioDevice; +use crate::devices::virtio::queue::Queue; +use crate::logger::{debug, error, info, trace, warn}; +pub const VIRTIO_PCI_COMMON_CONFIG_ID: &str = "virtio_pci_common_config"; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VirtioPciCommonConfigState { + pub driver_status: u8, + pub config_generation: u8, + pub device_feature_select: u32, + pub driver_feature_select: u32, + pub queue_select: u16, + pub msix_config: u16, + pub msix_queues: Vec, +} + +// The standard layout for the ring is a continuous chunk of memory which looks +// like this. We assume num is a power of 2. 
+// +// struct vring +// { +// // The actual descriptors (16 bytes each) +// struct vring_desc desc[num]; +// +// // A ring of available descriptor heads with free-running index. +// __virtio16 avail_flags; +// __virtio16 avail_idx; +// __virtio16 available[num]; +// __virtio16 used_event_idx; +// +// // Padding to the next align boundary. +// char pad[]; +// +// // A ring of used descriptor heads with free-running index. +// __virtio16 used_flags; +// __virtio16 used_idx; +// struct vring_used_elem used[num]; +// __virtio16 avail_event_idx; +// }; +// struct vring_desc { +// __virtio64 addr; +// __virtio32 len; +// __virtio16 flags; +// __virtio16 next; +// }; +// +// struct vring_avail { +// __virtio16 flags; +// __virtio16 idx; +// __virtio16 ring[]; +// }; +// +// // u32 is used here for ids for padding reasons. +// struct vring_used_elem { +// // Index of start of used descriptor chain. +// __virtio32 id; +// // Total length of the descriptor chain which was used (written to) +// __virtio32 len; +// }; +// +// Kernel header used for this reference: include/uapi/linux/virtio_ring.h +// Virtio Spec: https://docs.oasis-open.org/virtio/virtio/v1.2/csd01/virtio-v1.2-csd01.html +// +const VRING_DESC_ELEMENT_SIZE: usize = 16; +const VRING_AVAIL_ELEMENT_SIZE: usize = 2; +const VRING_USED_ELEMENT_SIZE: usize = 8; +#[derive(Debug)] +pub enum VringType { + Desc, + Avail, + Used, +} + +pub fn get_vring_size(t: VringType, queue_size: u16) -> u64 { + let (length_except_ring, element_size) = match t { + VringType::Desc => (0, VRING_DESC_ELEMENT_SIZE), + VringType::Avail => (6, VRING_AVAIL_ELEMENT_SIZE), + VringType::Used => (6, VRING_USED_ELEMENT_SIZE), + }; + (length_except_ring + element_size * queue_size as usize) as u64 +} + +/// Contains the data for reading and writing the common configuration structure of a virtio PCI +/// device. +/// +/// * Registers: +/// +/// ** About the whole device. +/// le32 device_feature_select; // 0x00 // read-write +/// le32 device_feature; // 0x04 // read-only for driver +/// le32 driver_feature_select; // 0x08 // read-write +/// le32 driver_feature; // 0x0C // read-write +/// le16 msix_config; // 0x10 // read-write +/// le16 num_queues; // 0x12 // read-only for driver +/// u8 device_status; // 0x14 // read-write (driver_status) +/// u8 config_generation; // 0x15 // read-only for driver +/// +/// ** About a specific virtqueue. +/// le16 queue_select; // 0x16 // read-write +/// le16 queue_size; // 0x18 // read-write, power of 2, or 0. 
+/// le16 queue_msix_vector; // 0x1A // read-write +/// le16 queue_enable; // 0x1C // read-write (Ready) +/// le16 queue_notify_off; // 0x1E // read-only for driver +/// le64 queue_desc; // 0x20 // read-write +/// le64 queue_avail; // 0x28 // read-write +/// le64 queue_used; // 0x30 // read-write +#[derive(Debug)] +pub struct VirtioPciCommonConfig { + pub driver_status: u8, + pub config_generation: u8, + pub device_feature_select: u32, + pub driver_feature_select: u32, + pub queue_select: u16, + pub msix_config: Arc, + pub msix_queues: Arc>>, +} + +impl VirtioPciCommonConfig { + pub fn new(state: VirtioPciCommonConfigState) -> Self { + VirtioPciCommonConfig { + driver_status: state.driver_status, + config_generation: state.config_generation, + device_feature_select: state.device_feature_select, + driver_feature_select: state.driver_feature_select, + queue_select: state.queue_select, + msix_config: Arc::new(AtomicU16::new(state.msix_config)), + msix_queues: Arc::new(Mutex::new(state.msix_queues)), + } + } + + fn state(&self) -> VirtioPciCommonConfigState { + VirtioPciCommonConfigState { + driver_status: self.driver_status, + config_generation: self.config_generation, + device_feature_select: self.device_feature_select, + driver_feature_select: self.driver_feature_select, + queue_select: self.queue_select, + msix_config: self.msix_config.load(Ordering::Acquire), + msix_queues: self.msix_queues.lock().unwrap().clone(), + } + } + + pub fn read(&mut self, offset: u64, data: &mut [u8], device: Arc>) { + assert!(data.len() <= 8); + + match data.len() { + 1 => { + let v = self.read_common_config_byte(offset); + data[0] = v; + } + 2 => { + let v = self.read_common_config_word(offset, device.lock().unwrap().queues()); + LittleEndian::write_u16(data, v); + } + 4 => { + let v = self.read_common_config_dword(offset, device); + LittleEndian::write_u32(data, v); + } + 8 => { + let v = self.read_common_config_qword(offset); + LittleEndian::write_u64(data, v); + } + _ => error!("invalid data length for virtio read: len {}", data.len()), + } + } + + pub fn write(&mut self, offset: u64, data: &[u8], device: Arc>) { + assert!(data.len() <= 8); + + match data.len() { + 1 => self.write_common_config_byte(offset, data[0]), + 2 => self.write_common_config_word( + offset, + LittleEndian::read_u16(data), + device.lock().unwrap().queues_mut(), + ), + 4 => self.write_common_config_dword(offset, LittleEndian::read_u32(data), device), + 8 => self.write_common_config_qword( + offset, + LittleEndian::read_u64(data), + device.lock().unwrap().queues_mut(), + ), + _ => error!("invalid data length for virtio write: len {}", data.len()), + } + } + + fn read_common_config_byte(&self, offset: u64) -> u8 { + debug!("read_common_config_byte: offset 0x{:x}", offset); + // The driver is only allowed to do aligned, properly sized access. 
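+        // In this layout only device_status (0x14) and config_generation (0x15) are
+        // byte-wide; all other registers go through the word/dword/qword paths below.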
+ match offset { + 0x14 => self.driver_status, + 0x15 => self.config_generation, + _ => { + warn!("invalid virtio config byte read: 0x{:x}", offset); + 0 + } + } + } + + fn write_common_config_byte(&mut self, offset: u64, value: u8) { + debug!("write_common_config_byte: offset 0x{offset:x}: {value:x}"); + match offset { + 0x14 => self.driver_status = value, + _ => { + warn!("invalid virtio config byte write: 0x{:x}", offset); + } + } + } + + fn read_common_config_word(&self, offset: u64, queues: &[Queue]) -> u16 { + debug!("read_common_config_word: offset 0x{:x}", offset); + match offset { + 0x10 => self.msix_config.load(Ordering::Acquire), + 0x12 => queues.len().try_into().unwrap(), // num_queues + 0x16 => self.queue_select, + 0x18 => self.with_queue(queues, |q| q.size).unwrap_or(0), + 0x1a => self.msix_queues.lock().unwrap()[self.queue_select as usize], + 0x1c => u16::from(self.with_queue(queues, |q| q.ready).unwrap_or(false)), + 0x1e => self.queue_select, // notify_off + _ => { + warn!("invalid virtio register word read: 0x{:x}", offset); + 0 + } + } + } + + fn write_common_config_word(&mut self, offset: u64, value: u16, queues: &mut [Queue]) { + debug!("write_common_config_word: offset 0x{:x}", offset); + match offset { + 0x10 => self.msix_config.store(value, Ordering::Release), + 0x16 => self.queue_select = value, + 0x18 => self.with_queue_mut(queues, |q| q.size = value), + 0x1a => self.msix_queues.lock().unwrap()[self.queue_select as usize] = value, + 0x1c => self.with_queue_mut(queues, |q| { + q.ready = value == 1; + }), + _ => { + warn!("invalid virtio register word write: 0x{:x}", offset); + } + } + } + + fn read_common_config_dword(&self, offset: u64, device: Arc>) -> u32 { + debug!("read_common_config_dword: offset 0x{:x}", offset); + match offset { + 0x00 => self.device_feature_select, + 0x04 => { + let locked_device = device.lock().unwrap(); + // Only 64 bits of features (2 pages) are defined for now, so limit + // device_feature_select to avoid shifting by 64 or more bits. 
+ if self.device_feature_select < 2 { + ((locked_device.avail_features() >> (self.device_feature_select * 32)) + & 0xffff_ffff) as u32 + } else { + 0 + } + } + 0x08 => self.driver_feature_select, + _ => { + warn!("invalid virtio register dword read: 0x{:x}", offset); + 0 + } + } + } + + fn write_common_config_dword( + &mut self, + offset: u64, + value: u32, + device: Arc>, + ) { + debug!("write_common_config_dword: offset 0x{:x}", offset); + fn hi(v: &mut GuestAddress, x: u32) { + *v = (*v & 0xffff_ffff) | (u64::from(x) << 32) + } + + fn lo(v: &mut GuestAddress, x: u32) { + *v = (*v & !0xffff_ffff) | u64::from(x) + } + + let mut locked_device = device.lock().unwrap(); + + match offset { + 0x00 => self.device_feature_select = value, + 0x08 => self.driver_feature_select = value, + 0x0c => locked_device.ack_features_by_page(self.driver_feature_select, value), + 0x20 => self.with_queue_mut(locked_device.queues_mut(), |q| { + lo(&mut q.desc_table_address, value) + }), + 0x24 => self.with_queue_mut(locked_device.queues_mut(), |q| { + hi(&mut q.desc_table_address, value) + }), + 0x28 => self.with_queue_mut(locked_device.queues_mut(), |q| { + lo(&mut q.avail_ring_address, value) + }), + 0x2c => self.with_queue_mut(locked_device.queues_mut(), |q| { + hi(&mut q.avail_ring_address, value) + }), + 0x30 => self.with_queue_mut(locked_device.queues_mut(), |q| { + lo(&mut q.used_ring_address, value) + }), + 0x34 => self.with_queue_mut(locked_device.queues_mut(), |q| { + hi(&mut q.used_ring_address, value) + }), + _ => { + warn!("invalid virtio register dword write: 0x{:x}", offset); + } + } + } + + fn read_common_config_qword(&self, _offset: u64) -> u64 { + debug!("read_common_config_qword: offset 0x{:x}", _offset); + 0 // Assume the guest has no reason to read write-only registers. + } + + fn write_common_config_qword(&mut self, offset: u64, value: u64, queues: &mut [Queue]) { + debug!("write_common_config_qword: offset 0x{:x}", offset); + + let low = Some((value & 0xffff_ffff) as u32); + let high = Some((value >> 32) as u32); + + match offset { + 0x20 => self.with_queue_mut(queues, |q| q.desc_table_address.0 = value), + 0x28 => self.with_queue_mut(queues, |q| q.avail_ring_address.0 = value), + 0x30 => self.with_queue_mut(queues, |q| q.used_ring_address.0 = value), + _ => { + warn!("invalid virtio register qword write: 0x{:x}", offset); + } + } + } + + fn with_queue(&self, queues: &[Queue], f: F) -> Option + where + F: FnOnce(&Queue) -> U, + { + queues.get(self.queue_select as usize).map(f) + } + + fn with_queue_mut(&self, queues: &mut [Queue], f: F) { + if let Some(queue) = queues.get_mut(self.queue_select as usize) { + f(queue); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::devices::virtio::transport::mmio::tests::DummyDevice; + + #[test] + fn write_base_regs() { + let mut regs = VirtioPciCommonConfig { + driver_status: 0xaa, + config_generation: 0x55, + device_feature_select: 0x0, + driver_feature_select: 0x0, + queue_select: 0xff, + msix_config: Arc::new(AtomicU16::new(0)), + msix_queues: Arc::new(Mutex::new(vec![0; 3])), + }; + + let dev = Arc::new(Mutex::new(DummyDevice::new())); + // Can set all bits of driver_status. + regs.write(0x14, &[0x55], dev.clone()); + let mut read_back = vec![0x00]; + regs.read(0x14, &mut read_back, dev.clone()); + assert_eq!(read_back[0], 0x55); + + // The config generation register is read only. 
+ regs.write(0x15, &[0xaa], dev.clone()); + let mut read_back = vec![0x00]; + regs.read(0x15, &mut read_back, dev.clone()); + assert_eq!(read_back[0], 0x55); + + // Device features is read-only and passed through from the device. + regs.write(0x04, &[0, 0, 0, 0], dev.clone()); + let mut read_back = vec![0, 0, 0, 0]; + regs.read(0x04, &mut read_back, dev.clone()); + assert_eq!(LittleEndian::read_u32(&read_back), 0u32); + + // Feature select registers are read/write. + regs.write(0x00, &[1, 2, 3, 4], dev.clone()); + let mut read_back = vec![0, 0, 0, 0]; + regs.read(0x00, &mut read_back, dev.clone()); + assert_eq!(LittleEndian::read_u32(&read_back), 0x0403_0201); + regs.write(0x08, &[1, 2, 3, 4], dev.clone()); + let mut read_back = vec![0, 0, 0, 0]; + regs.read(0x08, &mut read_back, dev.clone()); + assert_eq!(LittleEndian::read_u32(&read_back), 0x0403_0201); + + // 'queue_select' can be read and written. + regs.write(0x16, &[0xaa, 0x55], dev.clone()); + let mut read_back = vec![0x00, 0x00]; + regs.read(0x16, &mut read_back, dev); + assert_eq!(read_back[0], 0xaa); + assert_eq!(read_back[1], 0x55); + } +} diff --git a/src/vmm/src/devices/virtio/transport/pci/device.rs b/src/vmm/src/devices/virtio/transport/pci/device.rs new file mode 100644 index 00000000000..20c169297fd --- /dev/null +++ b/src/vmm/src/devices/virtio/transport/pci/device.rs @@ -0,0 +1,1279 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::any::Any; +use std::cmp; +use std::fmt::{Debug, Formatter}; +use std::io::Write; +use std::sync::atomic::{AtomicBool, AtomicU16, AtomicU32, AtomicUsize, Ordering}; +use std::sync::{Arc, Barrier, Mutex}; + +use anyhow::anyhow; +use kvm_ioctls::{IoEventAddress, NoDatamatch}; +use pci::{ + BarReprogrammingParams, MsixCap, MsixConfig, MsixConfigState, PciBarConfiguration, + PciBarRegionType, PciBdf, PciCapability, PciCapabilityId, PciClassCode, PciConfiguration, + PciConfigurationState, PciDevice, PciDeviceError, PciHeaderType, PciMassStorageSubclass, + PciNetworkControllerSubclass, PciSubclass, +}; +use serde::{Deserialize, Serialize}; +use thiserror::Error; +use vm_allocator::{AddressAllocator, AllocPolicy, RangeInclusive}; +use vm_device::interrupt::{InterruptIndex, InterruptSourceGroup, MsiIrqGroupConfig}; +use vm_device::{BusDevice, PciBarType, Resource}; +use vm_memory::{Address, ByteValued, GuestAddress, Le32}; +use vmm_sys_util::errno; +use vmm_sys_util::eventfd::EventFd; + +use crate::Vm; +use crate::device_manager::resources::ResourceAllocator; +use crate::devices::virtio::device::VirtioDevice; +use crate::devices::virtio::queue::Queue; +use crate::devices::virtio::transport::pci::common_config::{ + VirtioPciCommonConfig, VirtioPciCommonConfigState, +}; +use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; +use crate::devices::virtio::{TYPE_BLOCK, TYPE_NET}; +use crate::logger::{debug, error}; +use crate::utils::u64_to_usize; +use crate::vstate::memory::GuestMemoryMmap; +use crate::vstate::vm::{InterruptError, MsiVectorGroup}; + +const DEVICE_INIT: u8 = 0x00; +const DEVICE_ACKNOWLEDGE: u8 = 0x01; +const DEVICE_DRIVER: u8 = 0x02; +const DEVICE_DRIVER_OK: u8 = 0x04; +const DEVICE_FEATURES_OK: u8 = 0x08; +const DEVICE_FAILED: u8 = 0x80; + 
+const VIRTIO_F_RING_INDIRECT_DESC: u32 = 28; +const VIRTIO_F_RING_EVENT_IDX: u32 = 29; +const VIRTIO_F_VERSION_1: u32 = 32; +const VIRTIO_F_IOMMU_PLATFORM: u32 = 33; +const VIRTIO_F_IN_ORDER: u32 = 35; +const VIRTIO_F_ORDER_PLATFORM: u32 = 36; +#[allow(dead_code)] +const VIRTIO_F_SR_IOV: u32 = 37; +const VIRTIO_F_NOTIFICATION_DATA: u32 = 38; + +/// Vector value used to disable MSI for a queue. +const VIRTQ_MSI_NO_VECTOR: u16 = 0xffff; + +enum PciCapabilityType { + Common = 1, + Notify = 2, + Isr = 3, + Device = 4, + Pci = 5, + SharedMemory = 8, +} + +// This offset represents the 2 bytes omitted from the VirtioPciCap structure +// as they are already handled through add_capability(). These 2 bytes are the +// fields cap_vndr (1 byte) and cap_next (1 byte) defined in the virtio spec. +const VIRTIO_PCI_CAP_OFFSET: usize = 2; + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Debug, Clone, Copy, Default)] +struct VirtioPciCap { + cap_len: u8, // Generic PCI field: capability length + cfg_type: u8, // Identifies the structure. + pci_bar: u8, // Where to find it. + id: u8, // Multiple capabilities of the same type + padding: [u8; 2], // Pad to full dword. + offset: Le32, // Offset within bar. + length: Le32, // Length of the structure, in bytes. +} + +// SAFETY: All members are simple numbers and any value is valid. +unsafe impl ByteValued for VirtioPciCap {} + +impl PciCapability for VirtioPciCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +const VIRTIO_PCI_CAP_LEN_OFFSET: u8 = 2; + +impl VirtioPciCap { + pub fn new(cfg_type: PciCapabilityType, pci_bar: u8, offset: u32, length: u32) -> Self { + VirtioPciCap { + cap_len: u8::try_from(std::mem::size_of::()).unwrap() + + VIRTIO_PCI_CAP_LEN_OFFSET, + cfg_type: cfg_type as u8, + pci_bar, + id: 0, + padding: [0; 2], + offset: Le32::from(offset), + length: Le32::from(length), + } + } +} + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Clone, Copy, Default)] +struct VirtioPciNotifyCap { + cap: VirtioPciCap, + notify_off_multiplier: Le32, +} +// SAFETY: All members are simple numbers and any value is valid. +unsafe impl ByteValued for VirtioPciNotifyCap {} + +impl PciCapability for VirtioPciNotifyCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +impl VirtioPciNotifyCap { + pub fn new( + cfg_type: PciCapabilityType, + pci_bar: u8, + offset: u32, + length: u32, + multiplier: Le32, + ) -> Self { + VirtioPciNotifyCap { + cap: VirtioPciCap { + cap_len: u8::try_from(std::mem::size_of::()).unwrap() + + VIRTIO_PCI_CAP_LEN_OFFSET, + cfg_type: cfg_type as u8, + pci_bar, + id: 0, + padding: [0; 2], + offset: Le32::from(offset), + length: Le32::from(length), + }, + notify_off_multiplier: multiplier, + } + } +} + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Clone, Copy, Default)] +struct VirtioPciCap64 { + cap: VirtioPciCap, + offset_hi: Le32, + length_hi: Le32, +} +// SAFETY: All members are simple numbers and any value is valid. 
+unsafe impl ByteValued for VirtioPciCap64 {} + +impl PciCapability for VirtioPciCap64 { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +impl VirtioPciCap64 { + pub fn new(cfg_type: PciCapabilityType, pci_bar: u8, id: u8, offset: u64, length: u64) -> Self { + VirtioPciCap64 { + cap: VirtioPciCap { + cap_len: u8::try_from(std::mem::size_of::()).unwrap() + + VIRTIO_PCI_CAP_LEN_OFFSET, + cfg_type: cfg_type as u8, + pci_bar, + id, + padding: [0; 2], + offset: Le32::from((offset & 0xffff_ffff) as u32), + length: Le32::from((length & 0xffff_ffff) as u32), + }, + offset_hi: Le32::from((offset >> 32) as u32), + length_hi: Le32::from((length >> 32) as u32), + } + } +} + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Debug, Clone, Copy, Default)] +struct VirtioPciCfgCap { + cap: VirtioPciCap, + pci_cfg_data: [u8; 4], +} +// SAFETY: All members are simple numbers and any value is valid. +unsafe impl ByteValued for VirtioPciCfgCap {} + +impl PciCapability for VirtioPciCfgCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +impl VirtioPciCfgCap { + fn new() -> Self { + VirtioPciCfgCap { + cap: VirtioPciCap::new(PciCapabilityType::Pci, 0, 0, 0), + ..Default::default() + } + } +} + +#[derive(Debug, Clone, Copy, Default)] +struct VirtioPciCfgCapInfo { + offset: usize, + cap: VirtioPciCfgCap, +} + +#[allow(dead_code)] +#[derive(Debug, Copy, Clone)] +pub enum PciVirtioSubclass { + NonTransitionalBase = 0xff, +} + +impl PciSubclass for PciVirtioSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +// Allocate one bar for the structs pointed to by the capability structures. +// As per the PCI specification, because the same BAR shares MSI-X and non +// MSI-X structures, it is recommended to use 8KiB alignment for all those +// structures. +const COMMON_CONFIG_BAR_OFFSET: u64 = 0x0000; +const COMMON_CONFIG_SIZE: u64 = 56; +const ISR_CONFIG_BAR_OFFSET: u64 = 0x2000; +const ISR_CONFIG_SIZE: u64 = 1; +const DEVICE_CONFIG_BAR_OFFSET: u64 = 0x4000; +const DEVICE_CONFIG_SIZE: u64 = 0x1000; +const NOTIFICATION_BAR_OFFSET: u64 = 0x6000; +const NOTIFICATION_SIZE: u64 = 0x1000; +const MSIX_TABLE_BAR_OFFSET: u64 = 0x8000; +// The size is 256KiB because the table can hold up to 2048 entries, with each +// entry being 128 bits (4 DWORDS). +const MSIX_TABLE_SIZE: u64 = 0x40000; +const MSIX_PBA_BAR_OFFSET: u64 = 0x48000; +// The size is 2KiB because the Pending Bit Array has one bit per vector and it +// can support up to 2048 vectors. +const MSIX_PBA_SIZE: u64 = 0x800; +// The BAR size must be a power of 2. +const CAPABILITY_BAR_SIZE: u64 = 0x80000; +const VIRTIO_COMMON_BAR_INDEX: usize = 0; +const VIRTIO_SHM_BAR_INDEX: usize = 2; + +const NOTIFY_OFF_MULTIPLIER: u32 = 4; // A dword per notification address. + +const VIRTIO_PCI_VENDOR_ID: u16 = 0x1af4; +const VIRTIO_PCI_DEVICE_ID_BASE: u16 = 0x1040; // Add to device type to get device ID. 
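+// Within the capability BAR laid out by the offsets above, the notification
+// ("doorbell") address of queue `i` is
+// `config_bar_addr() + NOTIFICATION_BAR_OFFSET + i * NOTIFY_OFF_MULTIPLIER`.
+// register_notification_ioevent() below registers a KVM ioeventfd for each of these
+// addresses, so guest doorbell writes are delivered as eventfd signals instead of
+// trapping out to the VMM.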
+ +#[derive(Debug, Serialize, Deserialize)] +struct QueueState { + max_size: u16, + size: u16, + ready: bool, + desc_table: u64, + avail_ring: u64, + used_ring: u64, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct VirtioPciDeviceState { + pub pci_device_bdf: PciBdf, + device_activated: bool, + queues: Vec, + interrupt_status: usize, + cap_pci_cfg_offset: usize, + cap_pci_cfg: Vec, +} + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum VirtioPciDeviceError { + /// Failed creating VirtioPciDevice: {0} + CreateVirtioPciDevice(#[from] anyhow::Error), + /// Error creating MSI configuration: {0} + Msi(#[from] pci::MsixError), +} +pub type Result = std::result::Result; + +pub struct VirtioPciDevice { + id: String, + + // BDF assigned to the device + pci_device_bdf: PciBdf, + + // PCI configuration registers. + configuration: PciConfiguration, + + // virtio PCI common configuration + common_config: VirtioPciCommonConfig, + + // MSI-X config + msix_config: Option>>, + + // Number of MSI-X vectors + msix_num: u16, + + // Virtio device reference and status + device: Arc>, + device_activated: Arc, + + // PCI interrupts. + interrupt_status: Arc, + virtio_interrupt: Option>, + interrupt_source_group: Arc, + + // Guest memory + memory: GuestMemoryMmap, + + // Settings PCI BAR + settings_bar: u8, + + // Whether to use 64-bit bar location or 32-bit + use_64bit_bar: bool, + + // Add a dedicated structure to hold information about the very specific + // virtio-pci capability VIRTIO_PCI_CAP_PCI_CFG. This is needed to support + // the legacy/backward compatible mechanism of letting the guest access the + // other virtio capabilities without mapping the PCI BARs. This can be + // needed when the guest tries to early access the virtio configuration of + // a device. + cap_pci_cfg_info: VirtioPciCfgCapInfo, + + // Details of bar regions to free + pub bar_regions: Vec, +} + +impl Debug for VirtioPciDevice { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + f.debug_struct("VirtioPciDevice") + .field("id", &self.id) + .finish() + } +} + +impl VirtioPciDevice { + fn pci_configuration( + virtio_device_type: u32, + msix_config: &Arc>, + pci_config_state: Option, + ) -> PciConfiguration { + let pci_device_id = VIRTIO_PCI_DEVICE_ID_BASE + u16::try_from(virtio_device_type).unwrap(); + let (class, subclass) = match virtio_device_type { + TYPE_NET => ( + PciClassCode::NetworkController, + &PciNetworkControllerSubclass::EthernetController as &dyn PciSubclass, + ), + TYPE_BLOCK => ( + PciClassCode::MassStorage, + &PciMassStorageSubclass::MassStorage as &dyn PciSubclass, + ), + _ => ( + PciClassCode::Other, + &PciVirtioSubclass::NonTransitionalBase as &dyn PciSubclass, + ), + }; + + PciConfiguration::new( + VIRTIO_PCI_VENDOR_ID, + pci_device_id, + 0x1, // For modern virtio-PCI devices + class, + subclass, + None, + PciHeaderType::Device, + VIRTIO_PCI_VENDOR_ID, + pci_device_id, + Some(msix_config.clone()), + pci_config_state, + ) + } + + fn msix_config( + pci_device_bdf: u32, + msix_vectors: Arc, + msix_config_state: Option, + ) -> Result>> { + let msix_config = Arc::new(Mutex::new(MsixConfig::new( + msix_vectors.num_vectors(), + msix_vectors, + pci_device_bdf, + msix_config_state, + )?)); + + Ok(msix_config) + } + + /// Constructs a new PCI transport for the given virtio device. 
+ #[allow(clippy::too_many_arguments)] + pub fn new( + id: String, + memory: GuestMemoryMmap, + device: Arc>, + msi_vectors: Arc, + pci_device_bdf: u32, + ) -> Result { + let num_queues = device.lock().expect("Poisoned lock").queues().len(); + + let msix_config = Self::msix_config(pci_device_bdf, msi_vectors.clone(), None)?; + let pci_config = Self::pci_configuration( + device.lock().expect("Poisoned lock").device_type(), + &msix_config, + None, + ); + + let virtio_common_config = VirtioPciCommonConfig::new(VirtioPciCommonConfigState { + driver_status: 0, + config_generation: 0, + device_feature_select: 0, + driver_feature_select: 0, + queue_select: 0, + msix_config: VIRTQ_MSI_NO_VECTOR, + msix_queues: vec![VIRTQ_MSI_NO_VECTOR; num_queues], + }); + let interrupt = Arc::new(VirtioInterruptMsix::new( + msix_config.clone(), + virtio_common_config.msix_config.clone(), + virtio_common_config.msix_queues.clone(), + msi_vectors.clone(), + )); + + let virtio_pci_device = VirtioPciDevice { + id, + pci_device_bdf: pci_device_bdf.into(), + configuration: pci_config, + common_config: virtio_common_config, + msix_config: Some(msix_config), + msix_num: msi_vectors.num_vectors(), + device, + device_activated: Arc::new(AtomicBool::new(false)), + interrupt_status: Arc::new(AtomicUsize::new(0)), + virtio_interrupt: None, + memory, + settings_bar: 0, + use_64bit_bar: true, + interrupt_source_group: msi_vectors, + cap_pci_cfg_info: VirtioPciCfgCapInfo::default(), + bar_regions: vec![], + }; + + Ok(virtio_pci_device) + } + + fn is_driver_ready(&self) -> bool { + let ready_bits = + (DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_DRIVER_OK | DEVICE_FEATURES_OK); + self.common_config.driver_status == ready_bits + && self.common_config.driver_status & DEVICE_FAILED == 0 + } + + /// Determines if the driver has requested the device (re)init / reset itself + fn is_driver_init(&self) -> bool { + self.common_config.driver_status == DEVICE_INIT + } + + pub fn config_bar_addr(&self) -> u64 { + self.configuration.get_bar_addr(self.settings_bar as usize) + } + + fn add_pci_capabilities( + &mut self, + settings_bar: u8, + ) -> std::result::Result<(), PciDeviceError> { + // Add pointers to the different configuration structures from the PCI capabilities. + let common_cap = VirtioPciCap::new( + PciCapabilityType::Common, + settings_bar, + COMMON_CONFIG_BAR_OFFSET.try_into().unwrap(), + COMMON_CONFIG_SIZE.try_into().unwrap(), + ); + self.configuration + .add_capability(&common_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + let isr_cap = VirtioPciCap::new( + PciCapabilityType::Isr, + settings_bar, + ISR_CONFIG_BAR_OFFSET.try_into().unwrap(), + ISR_CONFIG_SIZE.try_into().unwrap(), + ); + self.configuration + .add_capability(&isr_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + // TODO(dgreid) - set based on device's configuration size? 
+ let device_cap = VirtioPciCap::new( + PciCapabilityType::Device, + settings_bar, + DEVICE_CONFIG_BAR_OFFSET.try_into().unwrap(), + DEVICE_CONFIG_SIZE.try_into().unwrap(), + ); + self.configuration + .add_capability(&device_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + let notify_cap = VirtioPciNotifyCap::new( + PciCapabilityType::Notify, + settings_bar, + NOTIFICATION_BAR_OFFSET.try_into().unwrap(), + NOTIFICATION_SIZE.try_into().unwrap(), + Le32::from(NOTIFY_OFF_MULTIPLIER), + ); + self.configuration + .add_capability(¬ify_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + let configuration_cap = VirtioPciCfgCap::new(); + self.cap_pci_cfg_info.offset = self + .configuration + .add_capability(&configuration_cap) + .map_err(PciDeviceError::CapabilitiesSetup)? + + VIRTIO_PCI_CAP_OFFSET; + self.cap_pci_cfg_info.cap = configuration_cap; + + if self.msix_config.is_some() { + let msix_cap = MsixCap::new( + settings_bar, + self.msix_num, + MSIX_TABLE_BAR_OFFSET.try_into().unwrap(), + settings_bar, + MSIX_PBA_BAR_OFFSET.try_into().unwrap(), + ); + self.configuration + .add_capability(&msix_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + } + + self.settings_bar = settings_bar; + Ok(()) + } + + fn read_cap_pci_cfg(&mut self, offset: usize, mut data: &mut [u8]) { + let cap_slice = self.cap_pci_cfg_info.cap.as_slice(); + let data_len = data.len(); + let cap_len = cap_slice.len(); + if offset + data_len > cap_len { + error!("Failed to read cap_pci_cfg from config space"); + return; + } + + if offset < std::mem::size_of::() { + if let Some(end) = offset.checked_add(data_len) { + // This write can't fail, offset and end are checked against config_len. + data.write_all(&cap_slice[offset..cmp::min(end, cap_len)]) + .unwrap(); + } + } else { + let bar_offset: u32 = + // SAFETY: we know self.cap_pci_cfg_info.cap.cap.offset is 32bits long. + unsafe { std::mem::transmute(self.cap_pci_cfg_info.cap.cap.offset) }; + self.read_bar(0, bar_offset as u64, data) + } + } + + fn write_cap_pci_cfg(&mut self, offset: usize, data: &[u8]) -> Option> { + let cap_slice = self.cap_pci_cfg_info.cap.as_mut_slice(); + let data_len = data.len(); + let cap_len = cap_slice.len(); + if offset + data_len > cap_len { + error!("Failed to write cap_pci_cfg to config space"); + return None; + } + + if offset < std::mem::size_of::() { + let (_, right) = cap_slice.split_at_mut(offset); + right[..data_len].copy_from_slice(data); + None + } else { + let bar_offset: u32 = + // SAFETY: we know self.cap_pci_cfg_info.cap.cap.offset is 32bits long. 
+ unsafe { std::mem::transmute(self.cap_pci_cfg_info.cap.cap.offset) }; + self.write_bar(0, bar_offset as u64, data) + } + } + + pub fn virtio_device(&self) -> Arc> { + self.device.clone() + } + + fn needs_activation(&self) -> bool { + !self.device_activated.load(Ordering::SeqCst) && self.is_driver_ready() + } + + /// Register the IoEvent notification for a VirtIO device + pub fn register_notification_ioevent(&self, vm: &Vm) -> std::result::Result<(), errno::Error> { + let bar_addr = self.config_bar_addr(); + for (i, queue_evt) in self + .device + .lock() + .expect("Poisoned lock") + .queue_events() + .iter() + .enumerate() + { + let notify_base = bar_addr + NOTIFICATION_BAR_OFFSET; + let io_addr = + IoEventAddress::Mmio(notify_base + i as u64 * NOTIFY_OFF_MULTIPLIER as u64); + vm.fd().register_ioevent(queue_evt, &io_addr, NoDatamatch)?; + } + Ok(()) + } + + /// Unregister the IoEvent notification for a VirtIO device + pub fn unregister_notification_ioevent( + &self, + vm: &Vm, + ) -> std::result::Result<(), errno::Error> { + let bar_addr = self.config_bar_addr(); + for (i, queue_evt) in self + .device + .lock() + .expect("Poisoned lock") + .queue_events() + .iter() + .enumerate() + { + let notify_base = bar_addr + NOTIFICATION_BAR_OFFSET; + let io_addr = + IoEventAddress::Mmio(notify_base + i as u64 * NOTIFY_OFF_MULTIPLIER as u64); + vm.fd() + .unregister_ioevent(queue_evt, &io_addr, NoDatamatch)?; + } + Ok(()) + } +} + +pub struct VirtioInterruptMsix { + msix_config: Arc>, + config_vector: Arc, + queues_vectors: Arc>>, + interrupt_source_group: Arc, +} + +impl std::fmt::Debug for VirtioInterruptMsix { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("VirtioInterruptMsix") + .field("msix_config", &self.msix_config) + .field("config_vector", &self.config_vector) + .field("queues_vectors", &self.queues_vectors) + .finish() + } +} + +impl VirtioInterruptMsix { + pub fn new( + msix_config: Arc>, + config_vector: Arc, + queues_vectors: Arc>>, + interrupt_source_group: Arc, + ) -> Self { + VirtioInterruptMsix { + msix_config, + config_vector, + queues_vectors, + interrupt_source_group, + } + } +} + +impl VirtioInterrupt for VirtioInterruptMsix { + fn trigger(&self, int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { + let vector = match int_type { + VirtioInterruptType::Config => self.config_vector.load(Ordering::Acquire), + VirtioInterruptType::Queue(queue_index) => { + self.queues_vectors.lock().unwrap()[queue_index as usize] + } + }; + + if vector == VIRTQ_MSI_NO_VECTOR { + return Ok(()); + } + + let config = &mut self.msix_config.lock().unwrap(); + let entry = &config.table_entries[vector as usize]; + // In case the vector control register associated with the entry + // has its first bit set, this means the vector is masked and the + // device should not inject the interrupt. + // Instead, the Pending Bit Array table is updated to reflect there + // is a pending interrupt for this specific vector. 
+ if config.masked() || entry.masked() { + config.set_pba_bit(vector, false); + return Ok(()); + } + + self.interrupt_source_group + .trigger(vector as InterruptIndex) + } + + fn notifier(&self, int_type: VirtioInterruptType) -> Option<&EventFd> { + let vector = match int_type { + VirtioInterruptType::Config => self.config_vector.load(Ordering::Acquire), + VirtioInterruptType::Queue(queue_index) => { + self.queues_vectors.lock().unwrap()[queue_index as usize] + } + }; + + self.interrupt_source_group + .notifier(vector as InterruptIndex) + } + + fn status(&self) -> Arc { + Arc::new(AtomicU32::new(0)) + } + + #[cfg(test)] + fn has_pending_interrupt(&self, interrupt_type: VirtioInterruptType) -> bool { + false + } +} + +impl PciDevice for VirtioPciDevice { + fn write_config_register( + &mut self, + reg_idx: usize, + offset: u64, + data: &[u8], + ) -> Option> { + // Handle the special case where the capability VIRTIO_PCI_CAP_PCI_CFG + // is accessed. This capability has a special meaning as it allows the + // guest to access other capabilities without mapping the PCI BAR. + let base = reg_idx * 4; + if base + u64_to_usize(offset) >= self.cap_pci_cfg_info.offset + && base + u64_to_usize(offset) + data.len() + <= self.cap_pci_cfg_info.offset + self.cap_pci_cfg_info.cap.bytes().len() + { + let offset = base + u64_to_usize(offset) - self.cap_pci_cfg_info.offset; + self.write_cap_pci_cfg(offset, data) + } else { + self.configuration + .write_config_register(reg_idx, offset, data); + None + } + } + + fn read_config_register(&mut self, reg_idx: usize) -> u32 { + // Handle the special case where the capability VIRTIO_PCI_CAP_PCI_CFG + // is accessed. This capability has a special meaning as it allows the + // guest to access other capabilities without mapping the PCI BAR. + let base = reg_idx * 4; + if base >= self.cap_pci_cfg_info.offset + && base + 4 <= self.cap_pci_cfg_info.offset + self.cap_pci_cfg_info.cap.bytes().len() + { + let offset = base - self.cap_pci_cfg_info.offset; + let mut data = [0u8; 4]; + self.read_cap_pci_cfg(offset, &mut data); + u32::from_le_bytes(data) + } else { + self.configuration.read_reg(reg_idx) + } + } + + fn detect_bar_reprogramming( + &mut self, + reg_idx: usize, + data: &[u8], + ) -> Option { + self.configuration.detect_bar_reprogramming(reg_idx, data) + } + + fn allocate_bars( + &mut self, + mmio32_allocator: &mut AddressAllocator, + mmio64_allocator: &mut AddressAllocator, + resources: Option>, + ) -> std::result::Result, PciDeviceError> { + let mut bars = Vec::new(); + let device_clone = self.device.clone(); + let device = device_clone.lock().unwrap(); + + let mut settings_bar_addr = None; + let mut use_64bit_bar = self.use_64bit_bar; + let restoring = resources.is_some(); + if let Some(resources) = resources { + for resource in resources { + if let Resource::PciBar { + index, base, type_, .. + } = resource + { + if index == VIRTIO_COMMON_BAR_INDEX { + settings_bar_addr = Some(GuestAddress(base)); + use_64bit_bar = match type_ { + PciBarType::Io => { + return Err(PciDeviceError::InvalidResource(resource)); + } + PciBarType::Mmio32 => false, + PciBarType::Mmio64 => true, + }; + break; + } + } + } + // Error out if no resource was matching the BAR id. + if settings_bar_addr.is_none() { + return Err(PciDeviceError::MissingResource); + } + } + + // Allocate the virtio-pci capability BAR. 
+ // See http://docs.oasis-open.org/virtio/virtio/v1.0/cs04/virtio-v1.0-cs04.html#x1-740004 + let policy = match settings_bar_addr { + Some(addr) => AllocPolicy::ExactMatch(addr.0), + None => AllocPolicy::FirstMatch, + }; + let (virtio_pci_bar_addr, region_type) = if use_64bit_bar { + let region_type = PciBarRegionType::Memory64BitRegion; + let addr = mmio64_allocator + .allocate(CAPABILITY_BAR_SIZE, CAPABILITY_BAR_SIZE, policy) + .unwrap() + .start(); + (addr, region_type) + } else { + let region_type = PciBarRegionType::Memory32BitRegion; + let addr = mmio32_allocator + .allocate(CAPABILITY_BAR_SIZE, CAPABILITY_BAR_SIZE, policy) + .unwrap() + .start(); + (addr, region_type) + }; + + let bar = PciBarConfiguration::default() + .set_index(VIRTIO_COMMON_BAR_INDEX) + .set_address(virtio_pci_bar_addr) + .set_size(CAPABILITY_BAR_SIZE) + .set_region_type(region_type); + + // The creation of the PCI BAR and its associated capabilities must + // happen only during the creation of a brand new VM. When a VM is + // restored from a known state, the BARs are already created with the + // right content, therefore we don't need to go through this codepath. + if !restoring { + self.configuration + .add_pci_bar(&bar) + .map_err(|e| PciDeviceError::IoRegistrationFailed(virtio_pci_bar_addr, e))?; + + // Once the BARs are allocated, the capabilities can be added to the PCI configuration. + self.add_pci_capabilities(VIRTIO_COMMON_BAR_INDEX.try_into().unwrap())?; + } + + bars.push(bar); + + self.bar_regions.clone_from(&bars); + + Ok(bars) + } + + fn free_bars( + &mut self, + mmio32_allocator: &mut AddressAllocator, + mmio64_allocator: &mut AddressAllocator, + ) -> std::result::Result<(), PciDeviceError> { + for bar in self.bar_regions.drain(..) { + let range = RangeInclusive::new(bar.addr(), bar.addr() + bar.size()).unwrap(); + match bar.region_type() { + PciBarRegionType::Memory32BitRegion => { + mmio32_allocator.free(&range); + } + PciBarRegionType::Memory64BitRegion => { + mmio64_allocator.free(&range); + } + _ => error!("Unexpected PCI bar type"), + } + } + Ok(()) + } + + fn move_bar( + &mut self, + old_base: u64, + new_base: u64, + ) -> std::result::Result<(), std::io::Error> { + // We only update our idea of the bar in order to support free_bars() above. + // The majority of the reallocation is done inside DeviceManager. + for bar in self.bar_regions.iter_mut() { + if bar.addr() == old_base { + *bar = bar.set_address(new_base); + } + } + + Ok(()) + } + + fn read_bar(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + match offset { + o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => { + self.common_config + .read(o - COMMON_CONFIG_BAR_OFFSET, data, self.device.clone()) + } + o if (ISR_CONFIG_BAR_OFFSET..ISR_CONFIG_BAR_OFFSET + ISR_CONFIG_SIZE).contains(&o) => { + if let Some(v) = data.get_mut(0) { + // Reading this register resets it to 0. + *v = self + .interrupt_status + .swap(0, Ordering::AcqRel) + .try_into() + .unwrap(); + } + } + o if (DEVICE_CONFIG_BAR_OFFSET..DEVICE_CONFIG_BAR_OFFSET + DEVICE_CONFIG_SIZE) + .contains(&o) => + { + let device = self.device.lock().unwrap(); + device.read_config(o - DEVICE_CONFIG_BAR_OFFSET, data); + } + o if (NOTIFICATION_BAR_OFFSET..NOTIFICATION_BAR_OFFSET + NOTIFICATION_SIZE) + .contains(&o) => + { + // Handled with ioeventfds. 
+ } + o if (MSIX_TABLE_BAR_OFFSET..MSIX_TABLE_BAR_OFFSET + MSIX_TABLE_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .read_table(o - MSIX_TABLE_BAR_OFFSET, data); + } + } + o if (MSIX_PBA_BAR_OFFSET..MSIX_PBA_BAR_OFFSET + MSIX_PBA_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .read_pba(o - MSIX_PBA_BAR_OFFSET, data); + } + } + _ => (), + } + } + + fn write_bar(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { + match offset { + o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => { + self.common_config + .write(o - COMMON_CONFIG_BAR_OFFSET, data, self.device.clone()) + } + o if (ISR_CONFIG_BAR_OFFSET..ISR_CONFIG_BAR_OFFSET + ISR_CONFIG_SIZE).contains(&o) => { + if let Some(v) = data.first() { + self.interrupt_status + .fetch_and(!(*v as usize), Ordering::AcqRel); + } + } + o if (DEVICE_CONFIG_BAR_OFFSET..DEVICE_CONFIG_BAR_OFFSET + DEVICE_CONFIG_SIZE) + .contains(&o) => + { + let mut device = self.device.lock().unwrap(); + device.write_config(o - DEVICE_CONFIG_BAR_OFFSET, data); + } + o if (NOTIFICATION_BAR_OFFSET..NOTIFICATION_BAR_OFFSET + NOTIFICATION_SIZE) + .contains(&o) => + { + // Handled with ioeventfds. + error!("Unexpected write to notification BAR: offset = 0x{:x}", o); + } + o if (MSIX_TABLE_BAR_OFFSET..MSIX_TABLE_BAR_OFFSET + MSIX_TABLE_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .write_table(o - MSIX_TABLE_BAR_OFFSET, data); + } + } + o if (MSIX_PBA_BAR_OFFSET..MSIX_PBA_BAR_OFFSET + MSIX_PBA_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .write_pba(o - MSIX_PBA_BAR_OFFSET, data); + } + } + _ => (), + }; + + // Try and activate the device if the driver status has changed + if self.needs_activation() { + debug!("Activating device"); + self.virtio_device() + .lock() + .unwrap() + .activate( + self.memory.clone(), + Arc::clone(self.virtio_interrupt.as_ref().unwrap()), + ) + .unwrap_or_else(|err| error!("Error activating device: {err:?}")); + } else { + debug!("Device doesn't need activation"); + } + + // Device has been reset by the driver + if self.device_activated.load(Ordering::SeqCst) && self.is_driver_init() { + let mut device = self.device.lock().unwrap(); + let reset_result = device.reset(); + match reset_result { + Some((virtio_interrupt, mut _queue_evts)) => { + // Upon reset the device returns its interrupt EventFD + self.virtio_interrupt = Some(virtio_interrupt); + self.device_activated.store(false, Ordering::SeqCst); + + // Reset queue readiness (changes queue_enable), queue sizes + // and selected_queue as per spec for reset + self.virtio_device() + .lock() + .unwrap() + .queues_mut() + .iter_mut() + .for_each(Queue::reset); + self.common_config.queue_select = 0; + } + None => { + error!("Attempt to reset device when not implemented in underlying device"); + self.common_config.driver_status = DEVICE_FAILED; + } + } + } + + None + } + + fn id(&self) -> Option { + Some(self.id.clone()) + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } +} + +impl BusDevice for VirtioPciDevice { + fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) { + self.read_bar(base, offset, data) + } + + fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option> { + self.write_bar(base, offset, data) + } +} + +#[cfg(test)] +mod tests { + use std::sync::{Arc, Mutex}; + + use 
event_manager::MutEventSubscriber; + use linux_loader::loader::Cmdline; + use pci::{PciBdf, PciClassCode, PciDevice, PciSubclass}; + + use super::VirtioPciDevice; + use crate::Vm; + use crate::arch::MEM_64BIT_DEVICES_START; + use crate::builder::tests::default_vmm; + use crate::devices::virtio::device::VirtioDevice; + use crate::devices::virtio::rng::Entropy; + use crate::devices::virtio::transport::pci::device::PciVirtioSubclass; + use crate::rate_limiter::RateLimiter; + + #[test] + fn test_pci_device_config() { + let mut vmm = default_vmm(); + vmm.device_manager.enable_pci(); + let entropy = Arc::new(Mutex::new(Entropy::new(RateLimiter::default()).unwrap())); + vmm.device_manager + .attach_virtio_device( + &vmm.vm, + "rng".to_string(), + entropy.clone(), + &mut Cmdline::new(1024).unwrap(), + false, + ) + .unwrap(); + + let device = vmm + .device_manager + .pci_devices + .get_virtio_device(entropy.lock().unwrap().device_type(), "rng") + .unwrap(); + + let mut locked_virtio_pci_device = device.lock().unwrap(); + + // For more information on the values we are checking here, look into the VirtIO spec here: + // https://docs.oasis-open.org/virtio/virtio/v1.1/csprd01/virtio-v1.1-csprd01.html#x1-1220007 + // and PCI Header type 0 layout here: https://wiki.osdev.org/PCI#Configuration_Space + + // | 16 bits | 16 bits | + // |-----------|-----------| + // register 0x0: | Device ID | Vendor ID | + // + // Vendor ID of VirtIO devices is 0x1af4 + let reg0 = locked_virtio_pci_device.read_config_register(0); + assert_eq!(reg0 & 0xffff, 0x1af4); + // VirtIO PCI device IDs are in the range [0x1000, 0x107f]. (We are not using transitional + // device IDs). + let devid = reg0 >> 16; + assert!( + (0x1000..=0x107f).contains(&devid), + "Device ID check: {:#x} >= 0x1000 && {:#x} <= 0x107f", + devid, + devid + ); + + // | 16 bits | 16 bits | + // |------------|-----------| + // register 0x1: | Status | Command | + // We offer the capabilities list (bit 4 of status register) at offset 0x34 + let reg1 = locked_virtio_pci_device.read_config_register(1); + assert_eq!(reg1, 0x0010_0000); + + // | 8 bits | 8 bits | 8 bits | 8 bits | + // register 0x2: | Class code | Subclass | Prog IF | Revision ID | + // + // Class code: VIRTIO_PCI_VENDOR_ID for all VirtIO devices + // Subclass: PciClassCode::NetworkController for net, PciClassCode::MassStore for block + // PciClassCode::Other for everything else + // Prog IF: A register defining some programmable interface. 0 for VirtIO devices + // Revision ID: 0x1 for modern VirtIO devices + let reg2 = locked_virtio_pci_device.read_config_register(2); + assert_eq!(reg2, 0xffff_0001); + let class_code = ((reg2 >> 24) & 0xff) as u8; + assert_eq!(class_code, PciClassCode::Other.get_register_value()); + let subclass = ((reg2 >> 16) & 0xff) as u8; + assert_eq!( + subclass, + PciVirtioSubclass::NonTransitionalBase.get_register_value() + ); + let prog_if = ((reg2 >> 8) & 0xff) as u8; + assert_eq!(prog_if, 0); + let revision_id = reg2 & 0xff; + assert_eq!(revision_id, 0x1); + + // | 8 bits | 8 bits | 8 bits | 8 bits | + // register 0x3: | BIST | Header Type | Latency timer | Cache line size | + // + // BIST: status and control for self test of PCI devices.
Always 0 for VirtIO devices + // HeaderType: 0x0 for general devices + // LatencyTimer: Latency timer in units of PCI bus clocks, 0 for VirtIO + // Cache Line size: 0 for VirtIO devices + let reg3 = locked_virtio_pci_device.read_config_register(3); + assert_eq!(reg3, 0x0); + + // register 0xa: Cardbus CIS pointer + // + // We don't emulate CardBus + let reg10 = locked_virtio_pci_device.read_config_register(0xa); + assert_eq!(reg10, 0); + + // | 16 bits | 16 bits | + // register 0xb: | Subsystem ID | Subsystem vendor ID| + // + // For us Subsystem ID is same as device ID and subsystem vendor ID is same as vendor ID + // (reg 0x0) + let reg11 = locked_virtio_pci_device.read_config_register(0xb); + assert_eq!(reg11, reg0); + + // register 0xc: Expansion ROM base address: 0x0 for us + let reg12 = locked_virtio_pci_device.read_config_register(0xc); + assert_eq!(reg12, 0); + + // | 24 bits | 8 bits | + // register 0xd: | Reserved | Capabilities pointer | + let reg13 = locked_virtio_pci_device.read_config_register(0xd); + assert_eq!(reg13 >> 24, 0); + + // register 0xe: Reserved + let reg14 = locked_virtio_pci_device.read_config_register(0xe); + assert_eq!(reg14, 0); + + // | 8 bits | 8 bits | 8 bits | 8 bits | + // register 0xf: | max latency | min grant | Interrupt pin | Interrupt line | + // + // We don't specify any of those + let reg15 = locked_virtio_pci_device.read_config_register(0xf); + assert_eq!(reg15, 0); + } + + #[test] + fn test_reading_bars() { + let mut vmm = default_vmm(); + vmm.device_manager.enable_pci(); + let entropy = Arc::new(Mutex::new(Entropy::new(RateLimiter::default()).unwrap())); + vmm.device_manager + .attach_virtio_device( + &vmm.vm, + "rng".to_string(), + entropy.clone(), + &mut Cmdline::new(1024).unwrap(), + false, + ) + .unwrap(); + + let device = vmm + .device_manager + .pci_devices + .get_virtio_device(entropy.lock().unwrap().device_type(), "rng") + .unwrap(); + + let mut locked_virtio_pci_device = device.lock().unwrap(); + + // According to OSdev wiki (https://wiki.osdev.org/PCI#Configuration_Space): + // + // When you want to retrieve the actual base address of a BAR, be sure to mask the lower + // bits. For 16-bit Memory Space BARs, you calculate (BAR[x] & 0xFFF0). For 32-bit Memory + // Space BARs, you calculate (BAR[x] & 0xFFFFFFF0). For 64-bit Memory Space BARs, you + // calculate ((BAR[x] & 0xFFFFFFF0) + ((BAR[x + 1] & 0xFFFFFFFF) << 32)) For I/O Space + // BARs, you calculate (BAR[x] & 0xFFFFFFFC). + + // We are allocating a single 64-bit MMIO bar for the VirtIO capabilities list. As a result, we + // are using the first two BAR registers from the configuration space. + // + // The BAR address layout is as follows: + // + // | Bits 31-4 | Bit 3 | Bits 2-1 | Bit 0 | + // | 16-Byte Aligned Base Address | Prefetchable | Type | Always 0 | + // + // For 64-bit addresses though a second BAR is used to hold the upper 32 bits + // of the address. Prefetchable and type will be held in the lower bits of the + // first bar along with the lower 32-bits of the address which is always 16-byte + // aligned.
+ let bar_addr_lo = locked_virtio_pci_device.read_config_register(0x4); + let bar_addr_hi = locked_virtio_pci_device.read_config_register(0x5); + let bar_addr = bar_addr_lo as u64 + ((bar_addr_hi as u64) << 32); + + // Bit 0 always 0 + assert_eq!(bar_addr & 0x1, 0); + // Type is 0x2 meaning 64-bit BAR + assert_eq!((bar_addr & 0x6) >> 1, 2); + // The actual address of the BAR should be the first available address of our 64-bit MMIO + // region + assert_eq!(bar_addr & 0xffff_ffff_ffff_fff0, MEM_64BIT_DEVICES_START); + + // Reading the BAR size is a bit more convoluted. According to OSDev wiki: + // + // To determine the amount of address space needed by a PCI device, you must save the + // original value of the BAR, write a value of all 1's to the register, then read it back. + // The amount of memory can then be determined by masking the information bits, performing + // a bitwise NOT ('~' in C), and incrementing the value by 1. + + locked_virtio_pci_device.write_config_register(0x4, 0, &[0xff, 0xff, 0xff, 0xff]); + // Read the lower size bits and mask out the last 4 bits, including Prefetchable, Type and + // hardwired-0 + let bar_size_lo = locked_virtio_pci_device.read_config_register(0x4) as u64 & 0xfffffff0; + locked_virtio_pci_device.write_config_register(0x5, 0, &[0xff, 0xff, 0xff, 0xff]); + let bar_size_hi = locked_virtio_pci_device.read_config_register(0x5) as u64; + let bar_size = !((bar_size_hi << 32) | bar_size_lo) + 1; + + // We create a capabilities BAR region of 0x80000 bytes + assert_eq!(bar_size, 0x80000); + } +} diff --git a/src/vmm/src/devices/virtio/transport/pci/mod.rs b/src/vmm/src/devices/virtio/transport/pci/mod.rs new file mode 100644 index 00000000000..520b52274b3 --- /dev/null +++ b/src/vmm/src/devices/virtio/transport/pci/mod.rs @@ -0,0 +1,5 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +pub mod common_config; +pub mod device; From 4d0492852246cd374de7744b8669015542f21e63 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 4 Jun 2025 10:29:18 +0200 Subject: [PATCH 42/99] seccomp: allow new ioctls for vCPU threads We are now calling KVM_CHECK_EXTENSION for checking the KVM_CAP_MSI_DEVID capability. We are also calling KVM_SET_GSI_ROUTING to set the interrupt routes and KVM_IRQFD to set/unset interrupt lines. Signed-off-by: Babis Chalios --- .../seccomp/aarch64-unknown-linux-musl.json | 43 +++++++++++++++++++ .../seccomp/x86_64-unknown-linux-musl.json | 43 +++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/resources/seccomp/aarch64-unknown-linux-musl.json b/resources/seccomp/aarch64-unknown-linux-musl.json index 8a3dac13673..433528b8f29 100644 --- a/resources/seccomp/aarch64-unknown-linux-musl.json +++ b/resources/seccomp/aarch64-unknown-linux-musl.json @@ -1020,6 +1020,49 @@ { "syscall": "restart_syscall", "comment": "automatically issued by the kernel when specific timing-related syscalls (e.g.
nanosleep) get interrupted by SIGSTOP" + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 44547, + "comment": "KVM_CHECK_EXTENSION" + }, + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 131, + "comment": "KVM_CAP_MSI_DEVID" + } + ] + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1074310762, + "comment": "KVM_SET_GSI_ROUTING" + } + ] + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1075883638, + "comment": "KVM_IRQFD" + } + ] } ] } diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index c3462d2f86b..14f2a26bafd 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -1152,6 +1152,49 @@ { "syscall": "restart_syscall", "comment": "automatically issued by the kernel when specific timing-related syscalls (e.g. nanosleep) get interrupted by SIGSTOP" + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 44547, + "comment": "KVM_CHECK_EXTENSION" + }, + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 131, + "comment": "KVM_CAP_MSI_DEVID" + } + ] + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1074310762, + "comment": "KVM_SET_GSI_ROUTING" + } + ] + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1075883638, + "comment": "KVM_IRQFD" + } + ] } ] } From 9f63e3e421621b8b0a6333cbc72f0fbe7565296c Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 4 Jun 2025 15:07:11 +0200 Subject: [PATCH 43/99] pci: add unit tests to PciSegment Add some unit tests to PciSegment. We now test that the next_device_bdf() method and the initialization logic work as expected. We also check that the configuration space of the PCI segment is correctly registered with the MMIO and, on x86, PIO bus. 
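Stepping back to the seccomp patch above for a moment: the numeric ioctl values in those rules are the standard Linux ioctl encodings of the KVM commands named in their comments, and they can be sanity-checked by rebuilding the encoding by hand. The sketch below does that in plain Rust; the KVMIO magic (0xAE), the command numbers (0x03, 0x6a, 0x76) and the argument sizes come from the kernel's linux/kvm.h, and the io/iow helpers are ad-hoc stand-ins for the kernel's _IO/_IOW macros, not anything defined in this patch series.

```rust
// Sanity-check sketch (not part of the patches): reconstruct the ioctl numbers
// used in the seccomp filters above from the standard Linux ioctl encoding:
// bits 0-7 nr, bits 8-15 magic, bits 16-29 argument size, bits 30-31 direction
// (1 = userspace writes the argument).
const KVMIO: u32 = 0xAE;

const fn io(nr: u32) -> u32 {
    (KVMIO << 8) | nr
}

const fn iow(nr: u32, size: u32) -> u32 {
    (1 << 30) | (size << 16) | (KVMIO << 8) | nr
}

fn main() {
    // KVM_CHECK_EXTENSION = _IO(KVMIO, 0x03), no argument encoded
    assert_eq!(io(0x03), 44547);
    // KVM_SET_GSI_ROUTING = _IOW(KVMIO, 0x6a, struct kvm_irq_routing), 8-byte header
    assert_eq!(iow(0x6a, 8), 1_074_310_762);
    // KVM_IRQFD = _IOW(KVMIO, 0x76, struct kvm_irqfd), 32-byte struct
    assert_eq!(iow(0x76, 32), 1_075_883_638);
    println!("seccomp ioctl values check out");
}
```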
Signed-off-by: Babis Chalios --- src/vmm/src/devices/pci/pci_segment.rs | 97 ++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/src/vmm/src/devices/pci/pci_segment.rs b/src/vmm/src/devices/pci/pci_segment.rs index 169ffdcba3b..c1e8bb07cb8 100644 --- a/src/vmm/src/devices/pci/pci_segment.rs +++ b/src/vmm/src/devices/pci/pci_segment.rs @@ -462,3 +462,100 @@ impl Aml for PciSegment { .append_aml_bytes(v) } } + +#[cfg(test)] +mod tests { + + use super::*; + use crate::arch; + use crate::utils::u64_to_usize; + + #[test] + fn test_pci_segment_build() { + let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); + let pci_irq_slots = &[0u8; 32]; + let pci_segment = PciSegment::new(0, &resource_allocator, pci_irq_slots).unwrap(); + + assert_eq!(pci_segment.id, 0); + assert_eq!( + pci_segment.start_of_mem32_area, + arch::MEM_32BIT_DEVICES_START + ); + assert_eq!( + pci_segment.end_of_mem32_area, + arch::MEM_32BIT_DEVICES_START + arch::MEM_32BIT_DEVICES_SIZE - 1 + ); + assert_eq!( + pci_segment.start_of_mem64_area, + arch::MEM_64BIT_DEVICES_START + ); + assert_eq!( + pci_segment.end_of_mem64_area, + arch::MEM_64BIT_DEVICES_START + arch::MEM_64BIT_DEVICES_SIZE - 1 + ); + assert_eq!(pci_segment.mmio_config_address, arch::PCI_MMCONFIG_START); + assert_eq!(pci_segment.proximity_domain, 0); + assert_eq!(pci_segment.pci_devices_up, 0); + assert_eq!(pci_segment.pci_devices_down, 0); + assert_eq!(pci_segment.pci_irq_slots, [0u8; 32]); + } + + #[cfg(target_arch = "x86_64")] + #[test] + fn test_io_bus() { + let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); + let pci_irq_slots = &[0u8; 32]; + let pci_segment = PciSegment::new(0, &resource_allocator, pci_irq_slots).unwrap(); + + let mut data = [0u8; u64_to_usize(PCI_CONFIG_IO_PORT_SIZE)]; + resource_allocator + .pio_bus + .read(PCI_CONFIG_IO_PORT, &mut data) + .unwrap(); + + resource_allocator + .pio_bus + .read(PCI_CONFIG_IO_PORT + PCI_CONFIG_IO_PORT_SIZE, &mut data) + .unwrap_err(); + } + + #[test] + fn test_mmio_bus() { + let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); + let pci_irq_slots = &[0u8; 32]; + let pci_segment = PciSegment::new(0, &resource_allocator, pci_irq_slots).unwrap(); + + let mut data = [0u8; u64_to_usize(PCI_MMIO_CONFIG_SIZE_PER_SEGMENT)]; + + resource_allocator + .mmio_bus + .read(pci_segment.mmio_config_address, &mut data) + .unwrap(); + resource_allocator + .mmio_bus + .read( + pci_segment.mmio_config_address + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, + &mut data, + ) + .unwrap_err(); + } + + #[test] + fn test_next_device_bdf() { + let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); + let pci_irq_slots = &[0u8; 32]; + let pci_segment = PciSegment::new(0, &resource_allocator, pci_irq_slots).unwrap(); + + // Start checking from device id 1, since 0 is allocated to the Root port. + for dev_id in 1..32 { + let bdf = pci_segment.next_device_bdf().unwrap(); + // In our case we have a single Segment with id 0, which has + // a single bus with id 0. Also, each device of ours has a + // single function. + assert_eq!(bdf, PciBdf::new(0, 0, dev_id, 0)); + } + + // We can only have 32 devices on a segment + pci_segment.next_device_bdf().unwrap_err(); + } +} From 85e5a33c0da1e14fd6a46233fcc016a40c8f856a Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Thu, 5 Jun 2025 15:34:45 +0200 Subject: [PATCH 44/99] device_manager: save resource allocator in snapshot vm-allocator now allows us to (De)serialize IdAllocator and AddressAllocator types. 
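A minimal sketch of the round-trip this enables is shown below, assuming the crate's "serde" feature (enabled by the Cargo.toml hunk further down) derives Serialize/Deserialize for these types; serde_json and the 5..=23 id range are used purely for illustration, and allocate_id() is the same allocator call referenced elsewhere in the series.

```rust
// Illustration only (assumes vm-allocator is built with the "serde" feature, as
// this patch does): snapshotting an IdAllocator preserves which ids are taken,
// so a restored allocator hands out the next free id instead of requiring
// ExactMatch re-allocations on restore.
use vm_allocator::IdAllocator;

fn main() {
    // Inclusive id range, same constructor shape as the GSI allocator in the diff below.
    let mut gsis = IdAllocator::new(5, 23).unwrap();
    let first = gsis.allocate_id().unwrap();

    // Serialize the allocator state and bring it back (serde_json used just for the demo).
    let saved = serde_json::to_string(&gsis).unwrap();
    let mut restored: IdAllocator = serde_json::from_str(&saved).unwrap();

    // The restored allocator remembers that `first` is already taken.
    assert_eq!(restored.allocate_id().unwrap(), first + 1);
}
```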
Add ResourceAllocator in DeviceManager snapshot state and restore it when loading a snapshot. Like this we can avoid doing the ExactMatch allocations during snapshot resumes for reserving the exact same MMIO ranges. Moreover, change DeviceManager and PciDevices to provide save/restore functionality via the Persist trait. Like that we can avoid first creating the objects and then restoring their state, overwriting their fields. Signed-off-by: Babis Chalios --- Cargo.lock | 1 + src/vmm/Cargo.toml | 2 +- src/vmm/src/builder.rs | 22 ++-- src/vmm/src/device_manager/mod.rs | 158 +++++++++++++++--------- src/vmm/src/device_manager/pci_mngr.rs | 49 +++++--- src/vmm/src/device_manager/persist.rs | 38 ++---- src/vmm/src/device_manager/resources.rs | 130 ++++++++++++++++++- src/vmm/src/devices/acpi/vmgenid.rs | 5 - src/vmm/src/lib.rs | 1 + src/vmm/src/persist.rs | 1 + 10 files changed, 283 insertions(+), 124 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c22fd8d9d04..ecdfe8fefca 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1618,6 +1618,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "040a65b0c29f298d71ca45dd52d02b0d0ddc15b9b97d95dfeebe67d6fdd42a28" dependencies = [ "libc", + "serde", "thiserror 2.0.12", ] diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index eeb71fd7d32..d6a112a268a 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -52,7 +52,7 @@ userfaultfd = "0.8.1" utils = { path = "../utils" } uuid = "1.16.0" vhost = { version = "0.14.0", features = ["vhost-user-frontend"] } -vm-allocator = "0.1.3" +vm-allocator = { version = "0.1.3", features = ["serde"] } vm-device = { path = "../vm-device" } vm-memory = { version = "0.16.2", features = [ "backend-mmap", diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index b9b60ea8895..44776f8c102 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -42,6 +42,7 @@ use crate::logger::debug; use crate::persist::{MicrovmState, MicrovmStateError}; use crate::resources::VmResources; use crate::seccomp::BpfThreadMap; +use crate::snapshot::Persist; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::MachineConfigError; use crate::vstate::kvm::{Kvm, KvmError}; @@ -411,8 +412,6 @@ pub fn build_microvm_from_snapshot( .create_vcpus(vm_resources.machine_config.vcpu_count) .map_err(StartMicrovmError::Vm)?; - let mut device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, &vm).unwrap(); - vm.register_memory_regions(guest_memory) .map_err(StartMicrovmError::Vm)?; @@ -430,16 +429,6 @@ pub fn build_microvm_from_snapshot( } } - // Restore allocator state - #[cfg(target_arch = "aarch64")] - if let Some(pvtime_ipa) = vcpus[0].kvm_vcpu.pvtime_ipa { - allocate_pvtime_region( - &mut device_manager, - vcpus.len(), - vm_allocator::AllocPolicy::ExactMatch(pvtime_ipa.0), - )?; - } - // Restore vcpus kvm state. for (vcpu, state) in vcpus.iter_mut().zip(microvm_state.vcpu_states.iter()) { vcpu.kvm_vcpu @@ -463,6 +452,9 @@ pub fn build_microvm_from_snapshot( vm_resources.boot_source.config = microvm_state.vm_info.boot_source; // Restore devices states. + // Restoring VMGenID injects an interrupt in the guest to notify it about the new generation + // ID. As a result, we need to restore DeviceManager after restoring the KVM state, otherwise + // the injected interrupt will be overwritten. 
let device_ctor_args = DeviceRestoreArgs { mem: vm.guest_memory(), vm: &vm, @@ -470,9 +462,11 @@ pub fn build_microvm_from_snapshot( vm_resources, instance_id: &instance_info.id, restored_from_file: uffd.is_none(), + vcpus_exit_evt: &vcpus_exit_evt, }; - - device_manager.restore(µvm_state.device_states, device_ctor_args)?; + #[allow(unused_mut)] + let mut device_manager = + DeviceManager::restore(device_ctor_args, µvm_state.device_states)?; let mut vmm = Vmm { events_observer: Some(std::io::stdin()), diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index da61db922c3..e60d64394e8 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -15,7 +15,7 @@ use legacy::{LegacyDeviceError, PortIODeviceManager}; use linux_loader::loader::Cmdline; use log::error; use mmio::{MMIODeviceManager, MmioError}; -use pci_mngr::{PciDevices, PciManagerError}; +use pci_mngr::{PciDevices, PciDevicesConstructorArgs, PciManagerError}; use persist::{ACPIDeviceManagerConstructorArgs, MMIODevManagerConstructorArgs}; use resources::ResourceAllocator; use serde::{Deserialize, Serialize}; @@ -127,30 +127,39 @@ impl DeviceManager { Ok(serial) } + #[cfg(target_arch = "x86_64")] + fn create_legacy_devices( + event_manager: &mut EventManager, + vcpus_exit_evt: &EventFd, + vm: &Vm, + resource_allocator: &ResourceAllocator, + ) -> Result { + Self::set_stdout_nonblocking(); + + // Create serial device + let serial = Self::setup_serial_device(event_manager)?; + let reset_evt = vcpus_exit_evt + .try_clone() + .map_err(DeviceManagerCreateError::EventFd)?; + // Create keyboard emulator for reset event + let i8042 = Arc::new(Mutex::new(I8042Device::new(reset_evt)?)); + + // create pio dev manager with legacy devices + let mut legacy_devices = PortIODeviceManager::new(serial, i8042)?; + legacy_devices.register_devices(&resource_allocator.pio_bus, vm)?; + Ok(legacy_devices) + } + #[cfg_attr(target_arch = "aarch64", allow(unused))] pub fn new( event_manager: &mut EventManager, - vcpu_exit_evt: &EventFd, + vcpus_exit_evt: &EventFd, vm: &Vm, ) -> Result { let resource_allocator = Arc::new(ResourceAllocator::new()?); #[cfg(target_arch = "x86_64")] - let legacy_devices = { - Self::set_stdout_nonblocking(); - - // Create serial device - let serial = Self::setup_serial_device(event_manager)?; - let reset_evt = vcpu_exit_evt - .try_clone() - .map_err(DeviceManagerCreateError::EventFd)?; - // Create keyboard emulator for reset event - let i8042 = Arc::new(Mutex::new(I8042Device::new(reset_evt)?)); - - // create pio dev manager with legacy devices - let mut legacy_devices = PortIODeviceManager::new(serial, i8042)?; - legacy_devices.register_devices(&resource_allocator.pio_bus, vm)?; - legacy_devices - }; + let legacy_devices = + Self::create_legacy_devices(event_manager, vcpus_exit_evt, vm, &resource_allocator)?; Ok(DeviceManager { resource_allocator, @@ -270,6 +279,8 @@ impl DeviceManager { #[derive(Debug, Default, Clone, Serialize, Deserialize)] /// State of devices in the system pub struct DevicesState { + /// Resource allocator state + pub resource_allocator_state: resources::ResourceAllocatorState, /// MMIO devices state pub mmio_state: persist::DeviceStates, /// ACPI devices state @@ -292,12 +303,15 @@ pub enum DevicePersistError { SerialRestore(#[from] EmulateSerialInitError), /// Error inserting device in bus: {0} Bus(#[from] vm_device::BusError), + /// Error creating DeviceManager: {0} + DeviceManager(#[from] DeviceManagerCreateError), } pub struct 
DeviceRestoreArgs<'a> { pub mem: &'a GuestMemoryMmap, pub vm: &'a Vm, pub event_manager: &'a mut EventManager, + pub vcpus_exit_evt: &'a EventFd, pub vm_resources: &'a mut VmResources, pub instance_id: &'a str, pub restored_from_file: bool, @@ -315,15 +329,82 @@ impl std::fmt::Debug for DeviceRestoreArgs<'_> { } } -impl DeviceManager { - pub fn save(&self) -> DevicesState { +impl<'a> Persist<'a> for DeviceManager { + type State = DevicesState; + type ConstructorArgs = DeviceRestoreArgs<'a>; + type Error = DevicePersistError; + + fn save(&self) -> Self::State { DevicesState { + resource_allocator_state: self.resource_allocator.save(), mmio_state: self.mmio_devices.save(), acpi_state: self.acpi_devices.save(), pci_state: self.pci_devices.save(), } } + fn restore( + constructor_args: Self::ConstructorArgs, + state: &Self::State, + ) -> std::result::Result { + // Safe to unwrap here. ResourceAllocator restoring cannot fail. + let resource_allocator = + Arc::new(ResourceAllocator::restore((), &state.resource_allocator_state).unwrap()); + + // Setup legacy devices in case of x86 + #[cfg(target_arch = "x86_64")] + let legacy_devices = Self::create_legacy_devices( + constructor_args.event_manager, + constructor_args.vcpus_exit_evt, + constructor_args.vm, + &resource_allocator, + )?; + + // Restore MMIO devices + let mmio_ctor_args = MMIODevManagerConstructorArgs { + mem: constructor_args.mem, + vm: constructor_args.vm, + event_manager: constructor_args.event_manager, + resource_allocator: &resource_allocator, + vm_resources: constructor_args.vm_resources, + instance_id: constructor_args.instance_id, + restored_from_file: constructor_args.restored_from_file, + }; + let mmio_devices = MMIODeviceManager::restore(mmio_ctor_args, &state.mmio_state)?; + + // Restore ACPI devices + let acpi_ctor_args = ACPIDeviceManagerConstructorArgs { + mem: constructor_args.mem, + resource_allocator: &resource_allocator, + vm: constructor_args.vm, + }; + let mut acpi_devices = ACPIDeviceManager::restore(acpi_ctor_args, &state.acpi_state)?; + acpi_devices.notify_vmgenid()?; + + // Restore PCI devices + let pci_ctor_args = PciDevicesConstructorArgs { + resource_allocator: &resource_allocator, + }; + let pci_devices = PciDevices::restore(pci_ctor_args, &state.pci_state)?; + + let device_manager = DeviceManager { + resource_allocator, + mmio_devices, + #[cfg(target_arch = "x86_64")] + legacy_devices, + acpi_devices, + pci_devices, + }; + + // Restore serial. + // We need to do that after we restore mmio devices, otherwise it won't succeed in Aarch64 + device_manager.emulate_serial_init()?; + + Ok(device_manager) + } +} + +impl DeviceManager { /// Sets RDA bit in serial console pub fn emulate_serial_init(&self) -> Result<(), EmulateSerialInitError> { // When restoring from a previously saved state, there is no serial @@ -361,43 +442,6 @@ impl DeviceManager { Ok(()) } } - - pub fn restore( - &mut self, - state: &DevicesState, - restore_args: DeviceRestoreArgs, - ) -> Result<(), DevicePersistError> { - // Restore MMIO devices - let mmio_ctor_args = MMIODevManagerConstructorArgs { - mem: restore_args.mem, - vm: restore_args.vm, - event_manager: restore_args.event_manager, - resource_allocator: &self.resource_allocator, - vm_resources: restore_args.vm_resources, - instance_id: restore_args.instance_id, - restored_from_file: restore_args.restored_from_file, - }; - self.mmio_devices = MMIODeviceManager::restore(mmio_ctor_args, &state.mmio_state)?; - - // Restore serial. 
- // We need to do that after we restore mmio devices, otherwise it won't succeed in Aarch64 - self.emulate_serial_init()?; - - // Restore ACPI devices - let acpi_ctor_args = ACPIDeviceManagerConstructorArgs { - mem: restore_args.mem, - resource_allocator: &self.resource_allocator, - vm: restore_args.vm, - }; - self.acpi_devices = ACPIDeviceManager::restore(acpi_ctor_args, &state.acpi_state)?; - self.acpi_devices.notify_vmgenid()?; - - // Restore PCI devices - self.pci_devices - .restore(&state.pci_state, &self.resource_allocator)?; - - Ok(()) - } } #[cfg(test)] diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index 686349858fb..70bb03388f6 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -16,6 +16,7 @@ use crate::device_manager::resources::ResourceAllocator; use crate::devices::pci::PciSegment; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::transport::pci::device::{VirtioPciDevice, VirtioPciDeviceError}; +use crate::snapshot::Persist; use crate::vstate::vm::InterruptError; #[derive(Debug, Default)] @@ -65,24 +66,6 @@ impl PciDevices { Ok(()) } - pub fn save(&self) -> PciDevicesState { - PciDevicesState { - pci_enabled: self.pci_segment.is_some(), - } - } - - pub fn restore( - &mut self, - state: &PciDevicesState, - resource_allocator: &Arc, - ) -> Result<(), PciManagerError> { - if state.pci_enabled { - self.attach_pci_segment(resource_allocator)?; - } - - Ok(()) - } - fn register_bars_with_bus( resource_allocator: &ResourceAllocator, virtio_device: &Arc>, @@ -194,3 +177,33 @@ impl PciDevices { pub struct PciDevicesState { pci_enabled: bool, } + +#[derive(Debug)] +pub struct PciDevicesConstructorArgs<'a> { + pub resource_allocator: &'a Arc, +} + +impl<'a> Persist<'a> for PciDevices { + type State = PciDevicesState; + type ConstructorArgs = PciDevicesConstructorArgs<'a>; + type Error = PciManagerError; + + fn save(&self) -> Self::State { + PciDevicesState { + pci_enabled: self.pci_segment.is_some(), + } + } + + fn restore( + constructor_args: Self::ConstructorArgs, + state: &Self::State, + ) -> std::result::Result { + let mut pci_devices = PciDevices::new(); + + if state.pci_enabled { + pci_devices.attach_pci_segment(constructor_args.resource_allocator)?; + } + + Ok(pci_devices) + } +} diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 7b9605b3e5d..620477ea88f 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -9,7 +9,6 @@ use std::sync::{Arc, Mutex}; use event_manager::{MutEventSubscriber, SubscriberOps}; use log::{error, warn}; use serde::{Deserialize, Serialize}; -use vm_allocator::AllocPolicy; use super::acpi::ACPIDeviceManager; use super::mmio::*; @@ -453,27 +452,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { .map_err(|()| DevicePersistError::MmioTransport)?, )); - // We do not currently require exact re-allocation of IDs via - // `dev_manager.irq_allocator.allocate_id()` and currently cannot do - // this effectively as `IdAllocator` does not implement an exact - // match API. - // In the future we may require preserving `IdAllocator`'s state - // after snapshot restore so as to restore the exact interrupt IDs - // from the original device's state for implementing hot-plug. - // For now this is why we do not restore the state of the - // `IdAllocator` under `dev_manager`. 
- - constructor_args - .resource_allocator - .allocate_32bit_mmio_memory( - MMIO_LEN, - MMIO_LEN, - AllocPolicy::ExactMatch(device_info.addr), - ) - .map_err(|e| { - DevicePersistError::DeviceManager(super::mmio::MmioError::Allocator(e)) - })?; - dev_manager.register_mmio_virtio( vm, id.clone(), @@ -653,6 +631,7 @@ mod tests { use super::*; use crate::builder::tests::*; + use crate::device_manager; use crate::devices::virtio::block::CacheType; use crate::resources::VmmConfig; use crate::snapshot::Snapshot; @@ -723,11 +702,10 @@ mod tests { #[test] fn test_device_manager_persistence() { - let mut buf = vec![0; 16384]; + let mut buf = vec![0; 65536]; // These need to survive so the restored blocks find them. let _block_files; let mut tmp_sock_file = TempFile::new().unwrap(); - let resource_allocator = ResourceAllocator::new().unwrap(); tmp_sock_file.remove().unwrap(); // Set up a vmm with one of each device, and get the serialized DeviceStates. { @@ -787,7 +765,10 @@ mod tests { let mut event_manager = EventManager::new().expect("Unable to create EventManager"); let vmm = default_vmm(); - let device_states: DeviceStates = Snapshot::deserialize(&mut buf.as_slice()).unwrap(); + let device_manager_state: device_manager::DevicesState = + Snapshot::deserialize(&mut buf.as_slice()).unwrap(); + let resource_allocator = + ResourceAllocator::restore((), &device_manager_state.resource_allocator_state).unwrap(); let vm_resources = &mut VmResources::default(); let restore_args = MMIODevManagerConstructorArgs { mem: vmm.vm.guest_memory(), @@ -799,7 +780,7 @@ mod tests { restored_from_file: true, }; let _restored_dev_manager = - MMIODeviceManager::restore(restore_args, &device_states).unwrap(); + MMIODeviceManager::restore(restore_args, &device_manager_state.mmio_state).unwrap(); let expected_vm_resources = format!( r#"{{ @@ -875,7 +856,10 @@ mod tests { .version(), MmdsVersion::V2 ); - assert_eq!(device_states.mmds.unwrap().version, MmdsVersion::V2); + assert_eq!( + device_manager_state.mmio_state.mmds.unwrap().version, + MmdsVersion::V2 + ); assert_eq!( expected_vm_resources, serde_json::to_string_pretty(&VmmConfig::from(&*vm_resources)).unwrap() diff --git a/src/vmm/src/device_manager/resources.rs b/src/vmm/src/device_manager/resources.rs index 2a93c7fd17f..2481d13b37e 100644 --- a/src/vmm/src/device_manager/resources.rs +++ b/src/vmm/src/device_manager/resources.rs @@ -1,14 +1,17 @@ // Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 +use std::convert::Infallible; use std::sync::{Arc, Mutex}; use pci::DeviceRelocation; +use serde::{Deserialize, Serialize}; pub use vm_allocator::AllocPolicy; use vm_allocator::{AddressAllocator, IdAllocator}; use vm_device::Bus; use crate::arch; +use crate::snapshot::Persist; /// A resource manager for (de)allocating interrupt lines (GSIs) and guest memory /// @@ -152,6 +155,69 @@ impl ResourceAllocator { } } +impl<'a> Persist<'a> for ResourceAllocator { + type State = ResourceAllocatorState; + type ConstructorArgs = (); + type Error = Infallible; + + fn save(&self) -> Self::State { + ResourceAllocatorState { + gsi_allocator: self.gsi_allocator.clone(), + mmio32_memory: self.mmio32_memory.clone(), + mmio64_memory: self.mmio64_memory.clone(), + system_memory: self.system_memory.clone(), + } + } + + fn restore( + _constructor_args: Self::ConstructorArgs, + state: &Self::State, + ) -> std::result::Result { + Ok(ResourceAllocator { + gsi_allocator: state.gsi_allocator.clone(), + mmio32_memory: state.mmio32_memory.clone(), + mmio64_memory: state.mmio64_memory.clone(), + system_memory: state.system_memory.clone(), + mmio_bus: Arc::new(Bus::new()), + #[cfg(target_arch = "x86_64")] + pio_bus: Arc::new(Bus::new()), + }) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ResourceAllocatorState { + // Allocator for device interrupt lines + pub gsi_allocator: Arc>, + // Allocator for memory in the 32-bit MMIO address space + pub mmio32_memory: Arc>, + // Allocator for memory in the 64-bit MMIO address space + pub mmio64_memory: Arc>, + // Memory allocator for system data + pub system_memory: Arc>, +} + +impl Default for ResourceAllocatorState { + fn default() -> Self { + Self { + gsi_allocator: Arc::new(Mutex::new( + IdAllocator::new(arch::IRQ_BASE, arch::IRQ_MAX).unwrap(), + )), + mmio32_memory: Arc::new(Mutex::new( + AddressAllocator::new(arch::MEM_32BIT_DEVICES_START, arch::MEM_32BIT_DEVICES_SIZE) + .unwrap(), + )), + mmio64_memory: Arc::new(Mutex::new( + AddressAllocator::new(arch::MEM_64BIT_DEVICES_START, arch::MEM_64BIT_DEVICES_SIZE) + .unwrap(), + )), + system_memory: Arc::new(Mutex::new( + AddressAllocator::new(arch::SYSTEM_MEM_START, arch::SYSTEM_MEM_SIZE).unwrap(), + )), + } + } +} + impl DeviceRelocation for ResourceAllocator { fn move_bar( &self, @@ -167,8 +233,11 @@ impl DeviceRelocation for ResourceAllocator { #[cfg(test)] mod tests { - use super::ResourceAllocator; - use crate::arch; + use vm_allocator::AllocPolicy; + + use super::{ResourceAllocator, ResourceAllocatorState}; + use crate::arch::{self, GSI_BASE}; + use crate::snapshot::{Persist, Snapshot}; const MAX_IRQS: u32 = arch::GSI_MAX - arch::GSI_BASE + 1; @@ -210,4 +279,61 @@ mod tests { assert_eq!(allocator.allocate_gsi(1), Ok(vec![i])); } } + + fn clone_allocator(allocator: &ResourceAllocator) -> ResourceAllocator { + let mut buf = vec![0u8; 1024]; + Snapshot::serialize(&mut buf.as_mut_slice(), &allocator.save()).unwrap(); + let restored_state: ResourceAllocatorState = + Snapshot::deserialize(&mut buf.as_slice()).unwrap(); + ResourceAllocator::restore((), &restored_state).unwrap() + } + + #[test] + fn test_save_restore() { + let allocator0 = ResourceAllocator::new().unwrap(); + let gsi_0 = allocator0.allocate_gsi(1).unwrap()[0]; + assert_eq!(gsi_0, GSI_BASE); + + let allocator1 = clone_allocator(&allocator0); + let gsi_1 = allocator1.allocate_gsi(1).unwrap()[0]; + assert_eq!(gsi_1, GSI_BASE + 1); + let mmio32_mem = allocator1 + .allocate_32bit_mmio_memory(0x42, 1, 
AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(mmio32_mem, arch::MEM_32BIT_DEVICES_START); + let mmio64_mem = allocator1 + .allocate_64bit_mmio_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(mmio64_mem, arch::MEM_64BIT_DEVICES_START); + let system_mem = allocator1 + .allocate_system_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(system_mem, arch::SYSTEM_MEM_START); + + let allocator2 = clone_allocator(&allocator1); + allocator2 + .allocate_32bit_mmio_memory(0x42, 1, AllocPolicy::ExactMatch(mmio32_mem)) + .unwrap_err(); + allocator2 + .allocate_64bit_mmio_memory(0x42, 1, AllocPolicy::ExactMatch(mmio64_mem)) + .unwrap_err(); + allocator2 + .allocate_system_memory(0x42, 1, AllocPolicy::ExactMatch(system_mem)) + .unwrap_err(); + + let gsi_2 = allocator2.allocate_gsi(1).unwrap()[0]; + assert_eq!(gsi_2, GSI_BASE + 2); + let mmio32_mem = allocator1 + .allocate_32bit_mmio_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(mmio32_mem, arch::MEM_32BIT_DEVICES_START + 0x42); + let mmio64_mem = allocator1 + .allocate_64bit_mmio_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(mmio64_mem, arch::MEM_64BIT_DEVICES_START + 0x42); + let system_mem = allocator1 + .allocate_system_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(system_mem, arch::SYSTEM_MEM_START + 0x42); + } } diff --git a/src/vmm/src/devices/acpi/vmgenid.rs index df0656bfbcc..0cf0ae0d7b1 100644 --- a/src/vmm/src/devices/acpi/vmgenid.rs +++ b/src/vmm/src/devices/acpi/vmgenid.rs @@ -152,11 +152,6 @@ impl<'a> Persist<'a> for VmGenId { constructor_args: Self::ConstructorArgs, state: &Self::State, ) -> std::result::Result<Self, Self::Error> { - constructor_args.resource_allocator.allocate_system_memory( - VMGENID_MEM_SIZE, - 8, - vm_allocator::AllocPolicy::ExactMatch(state.addr), - )?; Self::from_parts(GuestAddress(state.addr), state.gsi, constructor_args.mem) } } diff --git a/src/vmm/src/lib.rs index de99a4593af..f842e44c6a5 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -126,6 +126,7 @@ use devices::acpi::vmgenid::VmGenIdError; use devices::virtio::device::VirtioDevice; use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEventSubscriber}; use seccomp::BpfProgram; +use snapshot::Persist; use userfaultfd::Uffd; use vmm_sys_util::epoll::EventSet; use vmm_sys_util::eventfd::EventFd; diff --git a/src/vmm/src/persist.rs index f8b02a36876..54804717a19 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -599,6 +599,7 @@ mod tests { #[cfg(target_arch = "aarch64")] use crate::construct_kvm_mpidrs; use crate::devices::virtio::block::CacheType; + use crate::snapshot::Persist; use crate::vmm_config::balloon::BalloonDeviceConfig; use crate::vmm_config::net::NetworkInterfaceConfig; use crate::vmm_config::vsock::tests::default_config; From e53ffd8dd3d5d444b56ade15c120b78fe9cac822 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 6 Jun 2025 13:36:38 +0200 Subject: [PATCH 45/99] refactor: VirtIO MMIO persistence logic VirtIO MMIO restore logic activates the device the moment we restore the device state, if the device was activated when snapshotted. Move the activation responsibility to the logic that restores the MMIO transport. The reason for this change is that this is how it will be done for the PCI transport. Unifying this will allow us to reuse the same types for restoring the non-transport state of devices.
Note that we needed to change the way Net devices are saved/restored. The RxBuffer type of Net devices holds RX descriptors that we have parsed from the Queue ahead of time. The way we restored this info was by manipulating the queue to re-parse the RX descriptors during the restore phase. However, we need the device to be activated to do so, which it now isn't. So, instead of storing this info inside the snapshot, we make sure we have flushed everything before taking the snapshot. Also, simplify a bit the types that we use for serializing/deserializing the state of a device. Signed-off-by: Babis Chalios --- src/vmm/src/device_manager/mmio.rs | 80 -------- src/vmm/src/device_manager/mod.rs | 109 +++++++++- src/vmm/src/device_manager/persist.rs | 191 ++++++------------ src/vmm/src/devices/virtio/balloon/persist.rs | 36 ++-- src/vmm/src/devices/virtio/block/persist.rs | 10 +- .../devices/virtio/block/virtio/persist.rs | 18 +- src/vmm/src/devices/virtio/net/device.rs | 21 ++ src/vmm/src/devices/virtio/net/persist.rs | 48 +---- src/vmm/src/devices/virtio/rng/persist.rs | 16 +- src/vmm/src/devices/virtio/vsock/persist.rs | 14 +- src/vmm/src/lib.rs | 2 +- src/vmm/src/persist.rs | 15 +- 12 files changed, 224 insertions(+), 336 deletions(-) diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 8655247fde7..2a4b0161dab 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -15,7 +15,6 @@ use kvm_ioctls::IoEventAddress; use linux_loader::cmdline as kernel_cmdline; #[cfg(target_arch = "x86_64")] use log::debug; -use log::info; use serde::{Deserialize, Serialize}; use vm_allocator::AllocPolicy; @@ -27,14 +26,8 @@ use crate::arch::{RTC_MEM_START, SERIAL_MEM_START}; #[cfg(target_arch = "aarch64")] use crate::devices::legacy::{RTCDevice, SerialDevice}; use crate::devices::pseudo::BootTimer; -use crate::devices::virtio::balloon::Balloon; -use crate::devices::virtio::block::device::Block; use crate::devices::virtio::device::VirtioDevice; -use crate::devices::virtio::net::Net; -use crate::devices::virtio::rng::Entropy; use crate::devices::virtio::transport::mmio::MmioTransport; -use crate::devices::virtio::vsock::{TYPE_VSOCK, Vsock, VsockUnixBackend}; -use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; #[cfg(target_arch = "x86_64")] use crate::vstate::memory::GuestAddress; @@ -441,79 +434,6 @@ impl MMIODeviceManager { Ok(()) } - /// Artificially kick devices as if they had external events. - pub fn kick_devices(&self) { - info!("Artificially kick devices."); - // We only kick virtio devices for now. - let _: Result<(), MmioError> = self.for_each_virtio_device(|virtio_type, id, device| { - let mmio_transport_locked = device.inner.lock().expect("Poisoned locked"); - let mut virtio = mmio_transport_locked.locked_device(); - match *virtio_type { - TYPE_BALLOON => { - let balloon = virtio.as_mut_any().downcast_mut::<Balloon>().unwrap(); - // If device is activated, kick the balloon queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in snapshot. - // Stats queue doesn't need kicking as it is notified via a `timer_fd`. - if balloon.is_activated() { - info!("kick balloon {}.", id); - balloon.process_virtio_queues().unwrap(); - } - } - TYPE_BLOCK => { - // We only care about kicking virtio block. - // If we need to kick vhost-user-block we can do nothing.
- if let Some(block) = virtio.as_mut_any().downcast_mut::() { - // If device is activated, kick the block queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in - // snapshot. No need to kick Ratelimiters - // because they are restored 'unblocked' so - // any inflight `timer_fd` events can be safely discarded. - if block.is_activated() { - info!("kick block {}.", id); - block.process_virtio_queues().unwrap(); - } - } - } - TYPE_NET => { - let net = virtio.as_mut_any().downcast_mut::().unwrap(); - // If device is activated, kick the net queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in snapshot. - // No need to kick Ratelimiters because they are restored 'unblocked' so - // any inflight `timer_fd` events can be safely discarded. - if net.is_activated() { - info!("kick net {}.", id); - net.process_virtio_queues().unwrap(); - } - } - TYPE_VSOCK => { - // Vsock has complicated protocol that isn't resilient to any packet loss, - // so for Vsock we don't support connection persistence through snapshot. - // Any in-flight packets or events are simply lost. - // Vsock is restored 'empty'. - // The only reason we still `kick` it is to make guest process - // `TRANSPORT_RESET_EVENT` event we sent during snapshot creation. - let vsock = virtio - .as_mut_any() - .downcast_mut::>() - .unwrap(); - if vsock.is_activated() { - info!("kick vsock {id}."); - vsock.signal_used_queue(0).unwrap(); - } - } - TYPE_RNG => { - let entropy = virtio.as_mut_any().downcast_mut::().unwrap(); - if entropy.is_activated() { - info!("kick entropy {id}."); - entropy.process_virtio_queues().unwrap(); - } - } - _ => (), - } - Ok(()) - }); - } - #[cfg(target_arch = "aarch64")] pub fn virtio_device_info(&self) -> Vec<&MMIODeviceInfo> { let mut device_info = Vec::new(); diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index e60d64394e8..95e04111b13 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -5,6 +5,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the THIRD-PARTY file. 
+use std::convert::Infallible; use std::fmt::Debug; use std::sync::{Arc, Mutex}; @@ -13,7 +14,7 @@ use event_manager::{MutEventSubscriber, SubscriberOps}; #[cfg(target_arch = "x86_64")] use legacy::{LegacyDeviceError, PortIODeviceManager}; use linux_loader::loader::Cmdline; -use log::error; +use log::{error, info}; use mmio::{MMIODeviceManager, MmioError}; use pci_mngr::{PciDevices, PciDevicesConstructorArgs, PciManagerError}; use persist::{ACPIDeviceManagerConstructorArgs, MMIODevManagerConstructorArgs}; @@ -30,8 +31,14 @@ use crate::devices::legacy::RTCDevice; use crate::devices::legacy::serial::SerialOut; use crate::devices::legacy::{IER_RDA_BIT, IER_RDA_OFFSET, SerialDevice}; use crate::devices::pseudo::BootTimer; +use crate::devices::virtio::balloon::Balloon; +use crate::devices::virtio::block::device::Block; use crate::devices::virtio::device::VirtioDevice; +use crate::devices::virtio::net::Net; +use crate::devices::virtio::rng::Entropy; use crate::devices::virtio::transport::mmio::{IrqTrigger, MmioTransport}; +use crate::devices::virtio::vsock::{TYPE_VSOCK, Vsock, VsockUnixBackend}; +use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; use crate::resources::VmResources; use crate::snapshot::Persist; use crate::vstate::memory::GuestMemoryMmap; @@ -274,6 +281,106 @@ impl DeviceManager { self.pci_devices .attach_pci_segment(&self.resource_allocator) } + + fn do_kick_device(virtio_device: Arc>) { + let mut device = virtio_device.lock().expect("Poisoned lock"); + match device.device_type() { + TYPE_BALLOON => { + let balloon = device.as_mut_any().downcast_mut::().unwrap(); + // If device is activated, kick the balloon queue(s) to make up for any + // pending or in-flight epoll events we may have not captured in snapshot. + // Stats queue doesn't need kicking as it is notified via a `timer_fd`. + if balloon.is_activated() { + info!("kick balloon {}.", balloon.id()); + balloon.process_virtio_queues().unwrap(); + } + } + TYPE_BLOCK => { + // We only care about kicking virtio block. + // If we need to kick vhost-user-block we can do nothing. + if let Some(block) = device.as_mut_any().downcast_mut::() { + // If device is activated, kick the block queue(s) to make up for any + // pending or in-flight epoll events we may have not captured in + // snapshot. No need to kick Ratelimiters + // because they are restored 'unblocked' so + // any inflight `timer_fd` events can be safely discarded. + if block.is_activated() { + info!("kick block {}.", block.id()); + block.process_virtio_queues().unwrap(); + } + } + } + TYPE_NET => { + let net = device.as_mut_any().downcast_mut::().unwrap(); + // If device is activated, kick the net queue(s) to make up for any + // pending or in-flight epoll events we may have not captured in snapshot. + // No need to kick Ratelimiters because they are restored 'unblocked' so + // any inflight `timer_fd` events can be safely discarded. + if net.is_activated() { + info!("kick net {}.", net.id()); + net.process_virtio_queues().unwrap(); + } + } + TYPE_VSOCK => { + // Vsock has complicated protocol that isn't resilient to any packet loss, + // so for Vsock we don't support connection persistence through snapshot. + // Any in-flight packets or events are simply lost. + // Vsock is restored 'empty'. + // The only reason we still `kick` it is to make guest process + // `TRANSPORT_RESET_EVENT` event we sent during snapshot creation. 
+ let vsock = device + .as_mut_any() + .downcast_mut::>() + .unwrap(); + if vsock.is_activated() { + info!("kick vsock {}.", vsock.id()); + vsock.signal_used_queue(0).unwrap(); + } + } + TYPE_RNG => { + let entropy = device.as_mut_any().downcast_mut::().unwrap(); + if entropy.is_activated() { + info!("kick entropy {}.", entropy.id()); + entropy.process_virtio_queues().unwrap(); + } + } + _ => (), + } + } + + /// Artificially kick VirtIO devices as if they had external events. + pub fn kick_virtio_devices(&self) { + info!("Artificially kick devices"); + // Go through MMIO VirtIO devices + let _: Result<(), MmioError> = self.mmio_devices.for_each_virtio_device(|_, _, device| { + let mmio_transport_locked = device.inner.lock().expect("Poisoned lock"); + Self::do_kick_device(mmio_transport_locked.device()); + Ok(()) + }); + } + + fn do_mark_virtio_queue_memory_dirty( + device: Arc>, + mem: &GuestMemoryMmap, + ) { + // SAFETY: + // This should never fail as we mark pages only if device has already been activated, + // and the address validation was already performed on device activation. + let mut locked_device = device.lock().expect("Poisoned lock"); + if locked_device.is_activated() { + locked_device.mark_queue_memory_dirty(mem).unwrap() + } + } + + /// Mark queue memory dirty for activated VirtIO devices + pub fn mark_virtio_queue_memory_dirty(&self, mem: &GuestMemoryMmap) { + // Go through MMIO VirtIO devices + let _: Result<(), Infallible> = self.mmio_devices.for_each_virtio_device(|_, _, device| { + let mmio_transport_locked = device.inner.lock().expect("Poisoned locked"); + Self::do_mark_virtio_queue_memory_dirty(mmio_transport_locked.device(), mem); + Ok(()) + }); + } } #[derive(Debug, Default, Clone, Serialize, Deserialize)] diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 620477ea88f..2b3e2bcd815 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -42,7 +42,7 @@ use crate::devices::virtio::vsock::persist::{ use crate::devices::virtio::vsock::{ TYPE_VSOCK, Vsock, VsockError, VsockUnixBackend, VsockUnixBackendError, }; -use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; +use crate::devices::virtio::{ActivateError, TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; use crate::mmds::data_store::MmdsVersion; use crate::resources::{ResourcesError, VmResources}; use crate::snapshot::Persist; @@ -78,67 +78,17 @@ pub enum DevicePersistError { Entropy(#[from] EntropyError), /// Resource misconfiguration: {0}. Is the snapshot file corrupted? ResourcesError(#[from] ResourcesError), + /// Could not activate device: {0} + DeviceActivation(#[from] ActivateError), } -/// Holds the state of a balloon device connected to the MMIO space. +/// Holds the state of a MMIO VirtIO device #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ConnectedBalloonState { +pub struct VirtioDeviceState { /// Device identifier. pub device_id: String, /// Device state. - pub device_state: BalloonState, - /// Mmio transport state. - pub transport_state: MmioTransportState, - /// VmmResources. - pub device_info: MMIODeviceInfo, -} - -/// Holds the state of a virtio block device connected to the MMIO space. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ConnectedBlockState { - /// Device identifier. - pub device_id: String, - /// Device state. - pub device_state: BlockState, - /// Mmio transport state. - pub transport_state: MmioTransportState, - /// VmmResources. 
- pub device_info: MMIODeviceInfo, -} - -/// Holds the state of a net device connected to the MMIO space. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ConnectedNetState { - /// Device identifier. - pub device_id: String, - /// Device state. - pub device_state: NetState, - /// Mmio transport state. - pub transport_state: MmioTransportState, - /// VmmResources. - pub device_info: MMIODeviceInfo, -} - -/// Holds the state of a vsock device connected to the MMIO space. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ConnectedVsockState { - /// Device identifier. - pub device_id: String, - /// Device state. - pub device_state: VsockState, - /// Mmio transport state. - pub transport_state: MmioTransportState, - /// VmmResources. - pub device_info: MMIODeviceInfo, -} - -/// Holds the state of an entropy device connected to the MMIO space. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ConnectedEntropyState { - /// Device identifier. - pub device_id: String, - /// Device state. - pub device_state: EntropyState, + pub device_state: T, /// Mmio transport state. pub transport_state: MmioTransportState, /// VmmResources. @@ -168,17 +118,17 @@ pub struct DeviceStates { // State of legacy devices in MMIO space. pub legacy_devices: Vec, /// Block device states. - pub block_devices: Vec, + pub block_devices: Vec>, /// Net device states. - pub net_devices: Vec, + pub net_devices: Vec>, /// Vsock device state. - pub vsock_device: Option, + pub vsock_device: Option>, /// Balloon device state. - pub balloon_device: Option, + pub balloon_device: Option>, /// Mmds version. pub mmds: Option, /// Entropy device state. - pub entropy_device: Option, + pub entropy_device: Option>, } /// A type used to extract the concrete `Arc>` for each of the device @@ -292,20 +242,22 @@ impl<'a> Persist<'a> for MMIODeviceManager { let _: Result<(), ()> = self.for_each_virtio_device(|_, devid, device| { let mmio_transport_locked = device.inner.lock().expect("Poisoned lock"); let transport_state = mmio_transport_locked.save(); + let device_info = device.resources; + let device_id = devid.clone(); let mut locked_device = mmio_transport_locked.locked_device(); match locked_device.device_type() { TYPE_BALLOON => { - let balloon_state = locked_device + let device_state = locked_device .as_any() .downcast_ref::() .unwrap() .save(); - states.balloon_device = Some(ConnectedBalloonState { - device_id: devid.clone(), - device_state: balloon_state, + states.balloon_device = Some(VirtioDeviceState { + device_id, + device_state, transport_state, - device_info: device.resources, + device_info, }); } // Both virtio-block and vhost-user-block share same device type. 
@@ -318,16 +270,17 @@ impl<'a> Persist<'a> for MMIODeviceManager { ); } else { block.prepare_save(); - states.block_devices.push(ConnectedBlockState { - device_id: devid.clone(), - device_state: block.save(), + let device_state = block.save(); + states.block_devices.push(VirtioDeviceState { + device_id, + device_state, transport_state, - device_info: device.resources, - }) + device_info, + }); } } TYPE_NET => { - let net = locked_device.as_any().downcast_ref::().unwrap(); + let net = locked_device.as_mut_any().downcast_mut::().unwrap(); if let (Some(mmds_ns), None) = (net.mmds_ns.as_ref(), states.mmds.as_ref()) { let mmds_guard = mmds_ns.mmds.lock().expect("Poisoned lock"); states.mmds = Some(MmdsState { @@ -336,11 +289,13 @@ impl<'a> Persist<'a> for MMIODeviceManager { }); } - states.net_devices.push(ConnectedNetState { - device_id: devid.clone(), - device_state: net.save(), + net.prepare_save(); + let device_state = net.save(); + states.net_devices.push(VirtioDeviceState { + device_id, + device_state, transport_state, - device_info: device.resources, + device_info, }); } TYPE_VSOCK => { @@ -360,16 +315,16 @@ impl<'a> Persist<'a> for MMIODeviceManager { // Save state after potential notification to the guest. This // way we save changes to the queue the notification can cause. - let vsock_state = VsockState { + let device_state = VsockState { backend: vsock.backend().save(), frontend: vsock.save(), }; - states.vsock_device = Some(ConnectedVsockState { - device_id: devid.clone(), - device_state: vsock_state, + states.vsock_device = Some(VirtioDeviceState { + device_id, + device_state, transport_state, - device_info: device.resources, + device_info, }); } TYPE_RNG => { @@ -377,12 +332,13 @@ impl<'a> Persist<'a> for MMIODeviceManager { .as_mut_any() .downcast_mut::() .unwrap(); + let device_state = entropy.save(); - states.entropy_device = Some(ConnectedEntropyState { - device_id: devid.clone(), - device_state: entropy.save(), + states.entropy_device = Some(VirtioDeviceState { + device_id, + device_state, transport_state, - device_info: device.resources, + device_info, }); } _ => unreachable!(), @@ -432,19 +388,20 @@ impl<'a> Persist<'a> for MMIODeviceManager { } let mut restore_helper = |device: Arc>, + activated: bool, is_vhost_user: bool, as_subscriber: Arc>, id: &String, state: &MmioTransportState, - interrupt: Arc, device_info: &MMIODeviceInfo, mmio_bus: &vm_device::Bus, event_manager: &mut EventManager| -> Result<(), Self::Error> { + let interrupt = Arc::new(IrqTrigger::new()); let restore_args = MmioTransportConstructorArgs { mem: mem.clone(), - interrupt, - device, + interrupt: interrupt.clone(), + device: device.clone(), is_vhost_user, }; let mmio_transport = Arc::new(Mutex::new( @@ -462,16 +419,21 @@ impl<'a> Persist<'a> for MMIODeviceManager { }, )?; + if activated { + device + .lock() + .expect("Poisoned lock") + .activate(mem.clone(), interrupt)?; + } + event_manager.add_subscriber(as_subscriber); Ok(()) }; if let Some(balloon_state) = &state.balloon_device { - let interrupt = Arc::new(IrqTrigger::new()); let device = Arc::new(Mutex::new(Balloon::restore( BalloonConstructorArgs { mem: mem.clone(), - interrupt: interrupt.clone(), restored_from_file: constructor_args.restored_from_file, }, &balloon_state.device_state, @@ -483,11 +445,11 @@ impl<'a> Persist<'a> for MMIODeviceManager { restore_helper( device.clone(), + balloon_state.device_state.virtio_state.activated, false, device, &balloon_state.device_id, &balloon_state.transport_state, - interrupt, &balloon_state.device_info, 
&constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, @@ -495,12 +457,8 @@ impl<'a> Persist<'a> for MMIODeviceManager { } for block_state in &state.block_devices { - let interrupt = Arc::new(IrqTrigger::new()); let device = Arc::new(Mutex::new(Block::restore( - BlockConstructorArgs { - mem: mem.clone(), - interrupt: interrupt.clone(), - }, + BlockConstructorArgs { mem: mem.clone() }, &block_state.device_state, )?)); @@ -510,11 +468,11 @@ impl<'a> Persist<'a> for MMIODeviceManager { restore_helper( device.clone(), + block_state.device_state.is_activated(), false, device, &block_state.device_id, &block_state.transport_state, - interrupt, &block_state.device_info, &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, @@ -531,11 +489,9 @@ impl<'a> Persist<'a> for MMIODeviceManager { } for net_state in &state.net_devices { - let interrupt = Arc::new(IrqTrigger::new()); let device = Arc::new(Mutex::new(Net::restore( NetConstructorArgs { mem: mem.clone(), - interrupt: interrupt.clone(), mmds: constructor_args .vm_resources .mmds @@ -552,11 +508,11 @@ impl<'a> Persist<'a> for MMIODeviceManager { restore_helper( device.clone(), + net_state.device_state.virtio_state.activated, false, device, &net_state.device_id, &net_state.transport_state, - interrupt, &net_state.device_info, &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, @@ -568,11 +524,9 @@ impl<'a> Persist<'a> for MMIODeviceManager { cid: vsock_state.device_state.frontend.cid, }; let backend = VsockUnixBackend::restore(ctor_args, &vsock_state.device_state.backend)?; - let interrupt = Arc::new(IrqTrigger::new()); let device = Arc::new(Mutex::new(Vsock::restore( VsockConstructorArgs { mem: mem.clone(), - interrupt: interrupt.clone(), backend, }, &vsock_state.device_state.frontend, @@ -584,11 +538,11 @@ impl<'a> Persist<'a> for MMIODeviceManager { restore_helper( device.clone(), + vsock_state.device_state.frontend.virtio_state.activated, false, device, &vsock_state.device_id, &vsock_state.transport_state, - interrupt, &vsock_state.device_info, &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, @@ -596,8 +550,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { } if let Some(entropy_state) = &state.entropy_device { - let interrupt = Arc::new(IrqTrigger::new()); - let ctor_args = EntropyConstructorArgs::new(mem.clone(), interrupt.clone()); + let ctor_args = EntropyConstructorArgs { mem: mem.clone() }; let device = Arc::new(Mutex::new(Entropy::restore( ctor_args, @@ -610,11 +563,11 @@ impl<'a> Persist<'a> for MMIODeviceManager { restore_helper( device.clone(), + entropy_state.device_state.virtio_state.activated, false, device, &entropy_state.device_id, &entropy_state.transport_state, - interrupt, &entropy_state.device_info, &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, @@ -640,29 +593,8 @@ mod tests { use crate::vmm_config::net::NetworkInterfaceConfig; use crate::vmm_config::vsock::VsockDeviceConfig; - impl PartialEq for ConnectedBalloonState { - fn eq(&self, other: &ConnectedBalloonState) -> bool { - // Actual device state equality is checked by the device's tests. - self.transport_state == other.transport_state && self.device_info == other.device_info - } - } - - impl PartialEq for ConnectedBlockState { - fn eq(&self, other: &ConnectedBlockState) -> bool { - // Actual device state equality is checked by the device's tests. 
- self.transport_state == other.transport_state && self.device_info == other.device_info - } - } - - impl PartialEq for ConnectedNetState { - fn eq(&self, other: &ConnectedNetState) -> bool { - // Actual device state equality is checked by the device's tests. - self.transport_state == other.transport_state && self.device_info == other.device_info - } - } - - impl PartialEq for ConnectedVsockState { - fn eq(&self, other: &ConnectedVsockState) -> bool { + impl PartialEq for VirtioDeviceState { + fn eq(&self, other: &VirtioDeviceState) -> bool { // Actual device state equality is checked by the device's tests. self.transport_state == other.transport_state && self.device_info == other.device_info } @@ -674,6 +606,7 @@ mod tests { && self.block_devices == other.block_devices && self.net_devices == other.net_devices && self.vsock_device == other.vsock_device + && self.entropy_device == other.entropy_device } } diff --git a/src/vmm/src/devices/virtio/balloon/persist.rs b/src/vmm/src/devices/virtio/balloon/persist.rs index a6634d07170..15ae1e26b9e 100644 --- a/src/vmm/src/devices/virtio/balloon/persist.rs +++ b/src/vmm/src/devices/virtio/balloon/persist.rs @@ -87,7 +87,7 @@ pub struct BalloonState { stats_desc_index: Option, latest_stats: BalloonStatsState, config_space: BalloonConfigSpaceState, - virtio_state: VirtioDeviceState, + pub virtio_state: VirtioDeviceState, } /// Auxiliary structure for creating a device when resuming from a snapshot. @@ -95,8 +95,6 @@ pub struct BalloonState { pub struct BalloonConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, - /// Interrupt used from the device. - pub interrupt: Arc, pub restored_from_file: bool, } @@ -154,25 +152,18 @@ impl Persist<'_> for Balloon { actual_pages: state.config_space.actual_pages, }; - if state.virtio_state.activated { - balloon.device_state = DeviceState::Activated(ActiveState { - mem: constructor_args.mem, - interrupt: constructor_args.interrupt, - }); - - if balloon.stats_enabled() { - // Restore the stats descriptor. - balloon.set_stats_desc_index(state.stats_desc_index); - - // Restart timer if needed. - let timer_state = TimerState::Periodic { - current: Duration::from_secs(u64::from(state.stats_polling_interval_s)), - interval: Duration::from_secs(u64::from(state.stats_polling_interval_s)), - }; - balloon - .stats_timer - .set_state(timer_state, SetTimeFlags::Default); - } + if state.virtio_state.activated && balloon.stats_enabled() { + // Restore the stats descriptor. + balloon.set_stats_desc_index(state.stats_desc_index); + + // Restart timer if needed. 
+ let timer_state = TimerState::Periodic { + current: Duration::from_secs(u64::from(state.stats_polling_interval_s)), + interval: Duration::from_secs(u64::from(state.stats_polling_interval_s)), + }; + balloon + .stats_timer + .set_state(timer_state, SetTimeFlags::Default); } Ok(balloon) @@ -202,7 +193,6 @@ mod tests { let restored_balloon = Balloon::restore( BalloonConstructorArgs { mem: guest_mem, - interrupt: default_interrupt(), restored_from_file: true, }, &Snapshot::deserialize(&mut mem.as_slice()).unwrap(), diff --git a/src/vmm/src/devices/virtio/block/persist.rs b/src/vmm/src/devices/virtio/block/persist.rs index 57712a8fb3a..cb9a6471137 100644 --- a/src/vmm/src/devices/virtio/block/persist.rs +++ b/src/vmm/src/devices/virtio/block/persist.rs @@ -17,9 +17,17 @@ pub enum BlockState { VhostUser(VhostUserBlockState), } +impl BlockState { + pub fn is_activated(&self) -> bool { + match self { + BlockState::Virtio(virtio_block_state) => virtio_block_state.virtio_state.activated, + BlockState::VhostUser(vhost_user_block_state) => false, + } + } +} + /// Auxiliary structure for creating a device when resuming from a snapshot. #[derive(Debug)] pub struct BlockConstructorArgs { pub mem: GuestMemoryMmap, - pub interrupt: Arc, } diff --git a/src/vmm/src/devices/virtio/block/virtio/persist.rs b/src/vmm/src/devices/virtio/block/virtio/persist.rs index 57e4a11b9c1..1c7a1bce106 100644 --- a/src/vmm/src/devices/virtio/block/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/block/virtio/persist.rs @@ -58,7 +58,7 @@ pub struct VirtioBlockState { cache_type: CacheType, root_device: bool, disk_path: String, - virtio_state: VirtioDeviceState, + pub virtio_state: VirtioDeviceState, rate_limiter_state: RateLimiterState, file_engine_type: FileEngineTypeState, } @@ -111,15 +111,6 @@ impl Persist<'_> for VirtioBlock { let avail_features = state.virtio_state.avail_features; let acked_features = state.virtio_state.acked_features; - let device_state = if state.virtio_state.activated { - DeviceState::Activated(ActiveState { - mem: constructor_args.mem, - interrupt: constructor_args.interrupt, - }) - } else { - DeviceState::Inactive - }; - let config_space = ConfigSpace { capacity: disk_properties.nsectors.to_le(), }; @@ -132,7 +123,7 @@ impl Persist<'_> for VirtioBlock { queues, queue_evts, - device_state, + device_state: DeviceState::Inactive, id: state.id.clone(), partuuid: state.partuuid.clone(), @@ -227,10 +218,7 @@ mod tests { // Restore the block device. let restored_block = VirtioBlock::restore( - BlockConstructorArgs { - mem: guest_mem, - interrupt: default_interrupt(), - }, + BlockConstructorArgs { mem: guest_mem }, &Snapshot::deserialize(&mut mem.as_slice()).unwrap(), ) .unwrap(); diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index cf9f445d5df..4c6022a0067 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -8,6 +8,7 @@ use std::collections::VecDeque; use std::mem::{self}; use std::net::Ipv4Addr; +use std::num::Wrapping; use std::ops::Deref; use std::sync::{Arc, Mutex}; @@ -936,6 +937,26 @@ impl Net { Ok(()) } + + /// Prepare saving state + pub fn prepare_save(&mut self) { + // We shouldn't be messing with the queue if the device is not activated. + // Anyways, if it isn't there's nothing to prepare; we haven't parsed any + // descriptors yet from it and we can't have a deferred frame. 
+ if !self.is_activated() { + return; + } + + // Give potential deferred RX frame to guest + self.rx_buffer.finish_frame(&mut self.queues[RX_INDEX]); + // Reset the parsed available descriptors, so we will re-parse them + self.queues[RX_INDEX].next_avail -= + Wrapping(u16::try_from(self.rx_buffer.parsed_descriptors.len()).unwrap()); + self.rx_buffer.parsed_descriptors.clear(); + self.rx_buffer.iovec.clear(); + self.rx_buffer.used_bytes = 0; + self.rx_buffer.used_descriptors = 0; + } } impl VirtioDevice for Net { diff --git a/src/vmm/src/devices/virtio/net/persist.rs b/src/vmm/src/devices/virtio/net/persist.rs index 5ebd15f9d54..6ef8ad842ac 100644 --- a/src/vmm/src/devices/virtio/net/persist.rs +++ b/src/vmm/src/devices/virtio/net/persist.rs @@ -30,27 +30,6 @@ pub struct NetConfigSpaceState { guest_mac: Option, } -/// Information about the parsed RX buffers -#[derive(Debug, Default, Clone, Serialize, Deserialize)] -pub struct RxBufferState { - // Number of iovecs we have parsed from the guest - parsed_descriptor_chains_nr: u16, - // Number of used descriptors - used_descriptors: u16, - // Number of used bytes - used_bytes: u32, -} - -impl RxBufferState { - fn from_rx_buffers(rx_buffer: &RxBuffers) -> Self { - RxBufferState { - parsed_descriptor_chains_nr: rx_buffer.parsed_descriptors.len().try_into().unwrap(), - used_descriptors: rx_buffer.used_descriptors, - used_bytes: rx_buffer.used_bytes, - } - } -} - /// Information about the network device that are saved /// at snapshot. #[derive(Debug, Clone, Serialize, Deserialize)] @@ -62,8 +41,7 @@ pub struct NetState { /// The associated MMDS network stack. pub mmds_ns: Option, config_space: NetConfigSpaceState, - virtio_state: VirtioDeviceState, - rx_buffers_state: RxBufferState, + pub virtio_state: VirtioDeviceState, } /// Auxiliary structure for creating a device when resuming from a snapshot. @@ -71,8 +49,6 @@ pub struct NetState { pub struct NetConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, - /// Interrupt for the device. - pub interrupt: Arc, /// Pointer to the MMDS data store. pub mmds: Option>>, } @@ -108,7 +84,6 @@ impl Persist<'_> for Net { guest_mac: self.guest_mac, }, virtio_state: VirtioDeviceState::from_device(self), - rx_buffers_state: RxBufferState::from_rx_buffers(&self.rx_buffer), } } @@ -153,26 +128,6 @@ impl Persist<'_> for Net { net.avail_features = state.virtio_state.avail_features; net.acked_features = state.virtio_state.acked_features; - if state.virtio_state.activated { - let supported_flags: u32 = Net::build_tap_offload_features(net.acked_features); - net.tap - .set_offload(supported_flags) - .map_err(NetPersistError::TapSetOffload)?; - - net.device_state = DeviceState::Activated(ActiveState { - mem: constructor_args.mem, - interrupt: constructor_args.interrupt, - }); - - // Recreate `Net::rx_buffer`. We do it by re-parsing the RX queue. We're temporarily - // rolling back `next_avail` in the RX queue and call `parse_rx_descriptors`. 
- net.queues[RX_INDEX].next_avail -= state.rx_buffers_state.parsed_descriptor_chains_nr; - net.parse_rx_descriptors() - .map_err(|e| NetPersistError::VirtioState(VirtioStateError::InvalidAvailIdx(e)))?; - net.rx_buffer.used_descriptors = state.rx_buffers_state.used_descriptors; - net.rx_buffer.used_bytes = state.rx_buffers_state.used_bytes; - } - Ok(net) } } @@ -216,7 +171,6 @@ mod tests { match Net::restore( NetConstructorArgs { mem: guest_mem, - interrupt: default_interrupt(), mmds: mmds_ds, }, &Snapshot::deserialize(&mut mem.as_slice()).unwrap(), diff --git a/src/vmm/src/devices/virtio/rng/persist.rs b/src/vmm/src/devices/virtio/rng/persist.rs index 75db947c9c7..d266e259418 100644 --- a/src/vmm/src/devices/virtio/rng/persist.rs +++ b/src/vmm/src/devices/virtio/rng/persist.rs @@ -19,20 +19,13 @@ use crate::vstate::memory::GuestMemoryMmap; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct EntropyState { - virtio_state: VirtioDeviceState, + pub virtio_state: VirtioDeviceState, rate_limiter_state: RateLimiterState, } #[derive(Debug)] pub struct EntropyConstructorArgs { - mem: GuestMemoryMmap, - interrupt: Arc, -} - -impl EntropyConstructorArgs { - pub fn new(mem: GuestMemoryMmap, interrupt: Arc) -> Self { - Self { mem, interrupt } - } + pub mem: GuestMemoryMmap, } #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -72,9 +65,6 @@ impl Persist<'_> for Entropy { let mut entropy = Entropy::new_with_queues(queues, rate_limiter)?; entropy.set_avail_features(state.virtio_state.avail_features); entropy.set_acked_features(state.virtio_state.acked_features); - if state.virtio_state.activated { - entropy.set_activated(constructor_args.mem, constructor_args.interrupt); - } Ok(entropy) } @@ -99,7 +89,7 @@ mod tests { let guest_mem = create_virtio_mem(); let restored = Entropy::restore( - EntropyConstructorArgs::new(guest_mem, default_interrupt()), + EntropyConstructorArgs { mem: guest_mem }, &Snapshot::deserialize(&mut mem.as_slice()).unwrap(), ) .unwrap(); diff --git a/src/vmm/src/devices/virtio/vsock/persist.rs b/src/vmm/src/devices/virtio/vsock/persist.rs index 9d2fd61d9d5..6775707da3e 100644 --- a/src/vmm/src/devices/virtio/vsock/persist.rs +++ b/src/vmm/src/devices/virtio/vsock/persist.rs @@ -31,7 +31,7 @@ pub struct VsockState { pub struct VsockFrontendState { /// Context Identifier. pub cid: u64, - virtio_state: VirtioDeviceState, + pub virtio_state: VirtioDeviceState, } /// An enum for the serializable backend state types. @@ -53,8 +53,6 @@ pub struct VsockUdsState { pub struct VsockConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, - /// Interrupt to use for the device. - pub interrupt: Arc, /// The vsock Unix Backend. 
pub backend: B, } @@ -123,14 +121,7 @@ where vsock.acked_features = state.virtio_state.acked_features; vsock.avail_features = state.virtio_state.avail_features; - vsock.device_state = if state.virtio_state.activated { - DeviceState::Activated(ActiveState { - mem: constructor_args.mem, - interrupt: constructor_args.interrupt, - }) - } else { - DeviceState::Inactive - }; + vsock.device_state = DeviceState::Inactive; Ok(vsock) } } @@ -193,7 +184,6 @@ pub(crate) mod tests { let mut restored_device = Vsock::restore( VsockConstructorArgs { mem: ctx.mem.clone(), - interrupt: default_interrupt(), backend: match restored_state.backend { VsockBackendState::Uds(uds_state) => { assert_eq!(uds_state.path, "test".to_owned()); diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index f842e44c6a5..77d0e6c85b8 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -388,7 +388,7 @@ impl Vmm { /// Sends a resume command to the vCPUs. pub fn resume_vm(&mut self) -> Result<(), VmmError> { - self.device_manager.mmio_devices.kick_devices(); + self.device_manager.kick_virtio_devices(); // Send the events. self.vcpus_handles diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 54804717a19..80b47f86076 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -166,21 +166,8 @@ pub fn create_snapshot( // We need to mark queues as dirty again for all activated devices. The reason we // do it here is that we don't mark pages as dirty during runtime // for queue objects. - // SAFETY: - // This should never fail as we only mark pages only if device has already been activated, - // and the address validation was already performed on device activation. vmm.device_manager - .mmio_devices - .for_each_virtio_device(|_, _, device| { - let mmio_dev_locked = device.inner.lock().expect("Poisoned lock"); - let mut d = mmio_dev_locked.locked_device(); - if d.is_activated() { - d.mark_queue_memory_dirty(vmm.vm.guest_memory()) - } else { - Ok(()) - } - }) - .unwrap(); + .mark_virtio_queue_memory_dirty(vmm.vm.guest_memory()); Ok(()) } From 72fbaed570064e8f79155701d6a58b2e89b6556d Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Thu, 5 Jun 2025 11:39:21 +0200 Subject: [PATCH 46/99] pci: support snapshotting VirtIO PCI devices Support serializing the device-specific and transport state of a VirtIO device that uses the PCI transport. Signed-off-by: Babis Chalios --- src/vmm/src/builder.rs | 4 +- src/vmm/src/device_manager/mod.rs | 19 +- src/vmm/src/device_manager/pci_mngr.rs | 425 +++++++++++++++++- src/vmm/src/device_manager/persist.rs | 4 +- .../virtio/transport/pci/common_config.rs | 2 +- .../devices/virtio/transport/pci/device.rs | 173 ++++--- 6 files changed, 557 insertions(+), 70 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 44776f8c102..a3c100ba83a 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -451,6 +451,8 @@ pub fn build_microvm_from_snapshot( // Restore the boot source config paths. vm_resources.boot_source.config = microvm_state.vm_info.boot_source; + let vm = Arc::new(vm); + // Restore devices states. // Restoring VMGenID injects an interrupt in the guest to notify it about the new generation // ID. 
As a result, we need to restore DeviceManager after restoring the KVM state, otherwise @@ -473,7 +475,7 @@ pub fn build_microvm_from_snapshot( instance_info: instance_info.clone(), shutdown_exit_code: None, kvm, - vm: Arc::new(vm), + vm, uffd, vcpus_handles: Vec::new(), vcpus_exit_evt, diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 95e04111b13..260f3337673 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -357,6 +357,11 @@ impl DeviceManager { Self::do_kick_device(mmio_transport_locked.device()); Ok(()) }); + // Go through PCI VirtIO devices + for device in self.pci_devices.virtio_devices.values() { + let virtio_device = device.lock().expect("Poisoned lock").virtio_device(); + Self::do_kick_device(virtio_device); + } } fn do_mark_virtio_queue_memory_dirty( @@ -380,6 +385,12 @@ impl DeviceManager { Self::do_mark_virtio_queue_memory_dirty(mmio_transport_locked.device(), mem); Ok(()) }); + + // Go through PCI VirtIO devices + for device in self.pci_devices.virtio_devices.values() { + let virtio_device = device.lock().expect("Poisoned lock").virtio_device(); + Self::do_mark_virtio_queue_memory_dirty(virtio_device, mem); + } } } @@ -416,7 +427,7 @@ pub enum DevicePersistError { pub struct DeviceRestoreArgs<'a> { pub mem: &'a GuestMemoryMmap, - pub vm: &'a Vm, + pub vm: &'a Arc, pub event_manager: &'a mut EventManager, pub vcpus_exit_evt: &'a EventFd, pub vm_resources: &'a mut VmResources, @@ -491,6 +502,12 @@ impl<'a> Persist<'a> for DeviceManager { // Restore PCI devices let pci_ctor_args = PciDevicesConstructorArgs { resource_allocator: &resource_allocator, + vm: constructor_args.vm.clone(), + mem: constructor_args.mem, + vm_resources: constructor_args.vm_resources, + instance_id: constructor_args.instance_id, + restored_from_file: constructor_args.restored_from_file, + event_manager: constructor_args.event_manager, }; let pci_devices = PciDevices::restore(pci_ctor_args, &state.pci_state)?; diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index 70bb03388f6..d2c8ae27528 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -5,19 +5,38 @@ use std::collections::HashMap; use std::fmt::Debug; use std::sync::{Arc, Mutex}; -use event_manager::MutEventSubscriber; -use log::debug; +use event_manager::{MutEventSubscriber, SubscriberOps}; +use log::{debug, error, warn}; use pci::{PciBarRegionType, PciDevice, PciDeviceError, PciRootError}; use serde::{Deserialize, Serialize}; use vm_device::BusError; -use crate::Vm; +use super::persist::{MmdsState, SharedDeviceType}; use crate::device_manager::resources::ResourceAllocator; use crate::devices::pci::PciSegment; +use crate::devices::virtio::balloon::Balloon; +use crate::devices::virtio::balloon::persist::{BalloonConstructorArgs, BalloonState}; +use crate::devices::virtio::block::device::Block; +use crate::devices::virtio::block::persist::{BlockConstructorArgs, BlockState}; use crate::devices::virtio::device::VirtioDevice; -use crate::devices::virtio::transport::pci::device::{VirtioPciDevice, VirtioPciDeviceError}; +use crate::devices::virtio::net::Net; +use crate::devices::virtio::net::persist::{NetConstructorArgs, NetState}; +use crate::devices::virtio::rng::Entropy; +use crate::devices::virtio::rng::persist::{EntropyConstructorArgs, EntropyState}; +use crate::devices::virtio::transport::pci::device::{ + VirtioPciDevice, VirtioPciDeviceError, VirtioPciDeviceState, +}; +use 
crate::devices::virtio::vsock::persist::{ + VsockConstructorArgs, VsockState, VsockUdsConstructorArgs, +}; +use crate::devices::virtio::vsock::{TYPE_VSOCK, Vsock, VsockUnixBackend}; +use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; +use crate::resources::VmResources; use crate::snapshot::Persist; -use crate::vstate::vm::InterruptError; +use crate::vmm_config::mmds::MmdsConfigError; +use crate::vstate::memory::GuestMemoryMmap; +use crate::vstate::vm::{InterruptError, MsiVectorGroup}; +use crate::{EventManager, Vm}; #[derive(Debug, Default)] pub struct PciDevices { @@ -43,6 +62,8 @@ pub enum PciManagerError { PciDeviceError(#[from] PciDeviceError), /// KVM error: {0} Kvm(#[from] vmm_sys_util::errno::Error), + /// MMDS error: {0} + Mmds(#[from] MmdsConfigError), } impl PciDevices { @@ -119,6 +140,7 @@ impl PciDevices { let pci_device_bdf = pci_segment.next_device_bdf()?; debug!("Allocating BDF: {pci_device_bdf:?} for device"); let mem = vm.guest_memory().clone(); + let device_type: u32 = device.lock().expect("Poisoned lock").device_type(); // Allocate one MSI vector per queue, plus one for configuration let msix_num = @@ -153,6 +175,9 @@ impl PciDevices { .expect("Poisoned lock") .add_device(pci_device_bdf.device() as u32, virtio_device.clone())?; + self.virtio_devices + .insert((device_type, id.clone()), virtio_device.clone()); + Self::register_bars_with_bus(resource_allocator, &virtio_device)?; virtio_device .lock() @@ -162,6 +187,54 @@ impl PciDevices { Ok(()) } + fn restore_pci_device( + &mut self, + vm: &Arc, + resource_allocator: &ResourceAllocator, + device: Arc>, + device_id: &str, + transport_state: &VirtioPciDeviceState, + event_manager: &mut EventManager, + ) -> Result<(), PciManagerError> { + // We should only be reaching this point if PCI is enabled + let pci_segment = self.pci_segment.as_ref().unwrap(); + let msi_vector_group = Arc::new(MsiVectorGroup::restore( + vm.clone(), + &transport_state.msi_vector_group, + )?); + let device_type: u32 = device.lock().expect("Poisoned lock").device_type(); + + let virtio_device = Arc::new(Mutex::new(VirtioPciDevice::new_from_state( + device_id.to_string(), + vm.guest_memory().clone(), + device.clone(), + msi_vector_group, + transport_state.clone(), + )?)); + + pci_segment + .pci_bus + .lock() + .expect("Poisoned lock") + .add_device( + transport_state.pci_device_bdf.device() as u32, + virtio_device.clone(), + )?; + + self.virtio_devices + .insert((device_type, device_id.to_string()), virtio_device.clone()); + + Self::register_bars_with_bus(resource_allocator, &virtio_device)?; + virtio_device + .lock() + .expect("Poisoned lock") + .register_notification_ioevent(vm)?; + + event_manager.add_subscriber(device); + + Ok(()) + } + /// Gets the specified device. pub fn get_virtio_device( &self, @@ -173,14 +246,57 @@ impl PciDevices { } } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VirtioDeviceState { + /// Device identifier + pub device_id: String, + /// Device BDF + pub pci_device_bdf: u32, + /// Device state + pub device_state: T, + /// Transport state + pub transport_state: VirtioPciDeviceState, +} + #[derive(Default, Debug, Clone, Serialize, Deserialize)] pub struct PciDevicesState { - pci_enabled: bool, + /// Whether PCI is enabled + pub pci_enabled: bool, + /// Block device states. + pub block_devices: Vec>, + /// Net device states. + pub net_devices: Vec>, + /// Vsock device state. + pub vsock_device: Option>, + /// Balloon device state. + pub balloon_device: Option>, + /// Mmds state. 
+ pub mmds: Option, + /// Entropy device state. + pub entropy_device: Option>, } -#[derive(Debug)] pub struct PciDevicesConstructorArgs<'a> { + pub vm: Arc, + pub mem: &'a GuestMemoryMmap, pub resource_allocator: &'a Arc, + pub vm_resources: &'a mut VmResources, + pub instance_id: &'a str, + pub restored_from_file: bool, + pub event_manager: &'a mut EventManager, +} + +impl<'a> Debug for PciDevicesConstructorArgs<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PciDevicesConstructorArgs") + .field("vm", &self.vm) + .field("mem", &self.mem) + .field("resource_allocator", &self.resource_allocator) + .field("vm_resources", &self.vm_resources) + .field("instance_id", &self.instance_id) + .field("restored_from_file", &self.restored_from_file) + .finish() + } } impl<'a> Persist<'a> for PciDevices { @@ -189,19 +305,306 @@ impl<'a> Persist<'a> for PciDevices { type Error = PciManagerError; fn save(&self) -> Self::State { - PciDevicesState { - pci_enabled: self.pci_segment.is_some(), + let mut state = PciDevicesState::default(); + if self.pci_segment.is_some() { + state.pci_enabled = true; + } else { + return state; } + + for pci_dev in self.virtio_devices.values() { + let locked_pci_dev = pci_dev.lock().expect("Poisoned lock"); + let transport_state = locked_pci_dev.state(); + let virtio_dev = locked_pci_dev.virtio_device(); + let mut locked_virtio_dev = virtio_dev.lock().expect("Poisoned lock"); + + let pci_device_bdf = transport_state.pci_device_bdf.into(); + + match locked_virtio_dev.device_type() { + TYPE_BALLOON => { + let balloon_device = locked_virtio_dev + .as_any() + .downcast_ref::() + .unwrap(); + + let device_state = balloon_device.save(); + + state.balloon_device = Some(VirtioDeviceState { + device_id: balloon_device.id().to_string(), + pci_device_bdf, + device_state, + transport_state, + }); + } + TYPE_BLOCK => { + let block_dev = locked_virtio_dev + .as_mut_any() + .downcast_mut::() + .unwrap(); + if block_dev.is_vhost_user() { + warn!( + "Skipping vhost-user-block device. VhostUserBlock does not support \ + snapshotting yet" + ); + } else { + block_dev.prepare_save(); + let device_state = block_dev.save(); + state.block_devices.push(VirtioDeviceState { + device_id: block_dev.id().to_string(), + pci_device_bdf, + device_state, + transport_state, + }); + } + } + TYPE_NET => { + let net_dev = locked_virtio_dev + .as_mut_any() + .downcast_mut::() + .unwrap(); + if let (Some(mmds_ns), None) = (net_dev.mmds_ns.as_ref(), state.mmds.as_ref()) { + let mmds_guard = mmds_ns.mmds.lock().expect("Poisoned lock"); + state.mmds = Some(MmdsState { + version: mmds_guard.version(), + imds_compat: mmds_guard.imds_compat(), + }); + } + net_dev.prepare_save(); + let device_state = net_dev.save(); + + state.net_devices.push(VirtioDeviceState { + device_id: net_dev.id().to_string(), + pci_device_bdf, + device_state, + transport_state, + }) + } + TYPE_VSOCK => { + let vsock_dev = locked_virtio_dev + .as_mut_any() + // Currently, VsockUnixBackend is the only implementation of VsockBackend. + .downcast_mut::>() + .unwrap(); + + // Send Transport event to reset connections if device + // is activated. + if vsock_dev.is_activated() { + vsock_dev + .send_transport_reset_event() + .unwrap_or_else(|err| { + error!("Failed to send reset transport event: {:?}", err); + }); + } + + // Save state after potential notification to the guest. This + // way we save changes to the queue the notification can cause. 
+ let vsock_state = VsockState { + backend: vsock_dev.backend().save(), + frontend: vsock_dev.save(), + }; + + state.vsock_device = Some(VirtioDeviceState { + device_id: vsock_dev.id().to_string(), + pci_device_bdf, + device_state: vsock_state, + transport_state, + }); + } + TYPE_RNG => { + let rng_dev = locked_virtio_dev + .as_mut_any() + .downcast_mut::() + .unwrap(); + let device_state = rng_dev.save(); + + state.entropy_device = Some(VirtioDeviceState { + device_id: rng_dev.id().to_string(), + pci_device_bdf, + device_state, + transport_state, + }) + } + _ => unreachable!(), + } + } + + state } fn restore( constructor_args: Self::ConstructorArgs, state: &Self::State, ) -> std::result::Result { + let mem = constructor_args.mem; let mut pci_devices = PciDevices::new(); + if !state.pci_enabled { + return Ok(pci_devices); + } + + pci_devices.attach_pci_segment(constructor_args.resource_allocator)?; + + if let Some(balloon_state) = &state.balloon_device { + let device = Arc::new(Mutex::new( + Balloon::restore( + BalloonConstructorArgs { + mem: mem.clone(), + restored_from_file: constructor_args.restored_from_file, + }, + &balloon_state.device_state, + ) + .unwrap(), + )); + + constructor_args + .vm_resources + .update_from_restored_device(SharedDeviceType::Balloon(device.clone())) + .unwrap(); + + pci_devices + .restore_pci_device( + &constructor_args.vm, + constructor_args.resource_allocator, + device, + &balloon_state.device_id, + &balloon_state.transport_state, + constructor_args.event_manager, + ) + .unwrap() + } + + for block_state in &state.block_devices { + let device = Arc::new(Mutex::new( + Block::restore( + BlockConstructorArgs { mem: mem.clone() }, + &block_state.device_state, + ) + .unwrap(), + )); + + constructor_args + .vm_resources + .update_from_restored_device(SharedDeviceType::VirtioBlock(device.clone())) + .unwrap(); + + pci_devices + .restore_pci_device( + &constructor_args.vm, + constructor_args.resource_allocator, + device, + &block_state.device_id, + &block_state.transport_state, + constructor_args.event_manager, + ) + .unwrap() + } + + // Initialize MMDS if MMDS state is included. + if let Some(mmds) = &state.mmds { + constructor_args + .vm_resources + .set_mmds_basic_config(mmds.version, mmds.imds_compat, constructor_args.instance_id) + .unwrap(); + } else if state + .net_devices + .iter() + .any(|dev| dev.device_state.mmds_ns.is_some()) + { + // If there's at least one network device having an mmds_ns, it means + // that we are restoring from a version that did not persist the `MmdsVersionState`. + // Init with the default. + constructor_args.vm_resources.mmds_or_default()?; + } + + for net_state in &state.net_devices { + let device = Arc::new(Mutex::new( + Net::restore( + NetConstructorArgs { + mem: mem.clone(), + mmds: constructor_args + .vm_resources + .mmds + .as_ref() + // Clone the Arc reference. 
+ .cloned(), + }, + &net_state.device_state, + ) + .unwrap(), + )); + + constructor_args + .vm_resources + .update_from_restored_device(SharedDeviceType::Network(device.clone())) + .unwrap(); + + pci_devices + .restore_pci_device( + &constructor_args.vm, + constructor_args.resource_allocator, + device, + &net_state.device_id, + &net_state.transport_state, + constructor_args.event_manager, + ) + .unwrap() + } + + if let Some(vsock_state) = &state.vsock_device { + let ctor_args = VsockUdsConstructorArgs { + cid: vsock_state.device_state.frontend.cid, + }; + let backend = + VsockUnixBackend::restore(ctor_args, &vsock_state.device_state.backend).unwrap(); + let device = Arc::new(Mutex::new( + Vsock::restore( + VsockConstructorArgs { + mem: mem.clone(), + backend, + }, + &vsock_state.device_state.frontend, + ) + .unwrap(), + )); + + constructor_args + .vm_resources + .update_from_restored_device(SharedDeviceType::Vsock(device.clone())) + .unwrap(); + + pci_devices + .restore_pci_device( + &constructor_args.vm, + constructor_args.resource_allocator, + device, + &vsock_state.device_id, + &vsock_state.transport_state, + constructor_args.event_manager, + ) + .unwrap() + } + + if let Some(entropy_state) = &state.entropy_device { + let ctor_args = EntropyConstructorArgs { mem: mem.clone() }; + + let device = Arc::new(Mutex::new( + Entropy::restore(ctor_args, &entropy_state.device_state).unwrap(), + )); + + constructor_args + .vm_resources + .update_from_restored_device(SharedDeviceType::Entropy(device.clone())) + .unwrap(); - if state.pci_enabled { - pci_devices.attach_pci_segment(constructor_args.resource_allocator)?; + pci_devices + .restore_pci_device( + &constructor_args.vm, + constructor_args.resource_allocator, + device, + &entropy_state.device_id, + &entropy_state.transport_state, + constructor_args.event_manager, + ) + .unwrap() } Ok(pci_devices) diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 2b3e2bcd815..db38bb4f4ca 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -107,8 +107,8 @@ pub struct ConnectedLegacyState { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct MmdsState { - version: MmdsVersion, - imds_compat: bool, + pub version: MmdsVersion, + pub imds_compat: bool, } /// Holds the device states. 
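For reference, each entry in the new `PciDevicesState` pairs a device's own serialized state with the PCI transport state captured by `VirtioPciDevice::state()`. A minimal sketch of how one such entry is produced; `MyDev` and `MyDevState` are hypothetical stand-ins for a concrete device (Block, Net, ...), while the real `save()` above walks `virtio_devices` and downcasts by `device_type()`:

    // Illustrative only: one snapshot entry couples device state with transport state.
    fn save_one(pci_dev: &VirtioPciDevice, dev: &MyDev) -> VirtioDeviceState<MyDevState> {
        let transport_state = pci_dev.state();
        VirtioDeviceState {
            device_id: dev.id().to_string(),
            pci_device_bdf: transport_state.pci_device_bdf.into(),
            device_state: dev.save(),    // queues, features, device-specific config
            transport_state,             // common config, MSI-X state, BARs, capabilities
        }
    }

On restore the order is reversed, as in the `restore()` implementation above: the device is rebuilt from `device_state` with its constructor args, and `restore_pci_device()` then recreates the transport around it from `transport_state`.
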
diff --git a/src/vmm/src/devices/virtio/transport/pci/common_config.rs b/src/vmm/src/devices/virtio/transport/pci/common_config.rs index c8ee2d1d2a9..6e52a1ca007 100644 --- a/src/vmm/src/devices/virtio/transport/pci/common_config.rs +++ b/src/vmm/src/devices/virtio/transport/pci/common_config.rs @@ -144,7 +144,7 @@ impl VirtioPciCommonConfig { } } - fn state(&self) -> VirtioPciCommonConfigState { + pub fn state(&self) -> VirtioPciCommonConfigState { VirtioPciCommonConfigState { driver_status: self.driver_status, config_generation: self.config_generation, diff --git a/src/vmm/src/devices/virtio/transport/pci/device.rs b/src/vmm/src/devices/virtio/transport/pci/device.rs index 20c169297fd..6793d502f00 100644 --- a/src/vmm/src/devices/virtio/transport/pci/device.rs +++ b/src/vmm/src/devices/virtio/transport/pci/device.rs @@ -9,6 +9,7 @@ use std::any::Any; use std::cmp; +use std::collections::HashMap; use std::fmt::{Debug, Formatter}; use std::io::Write; use std::sync::atomic::{AtomicBool, AtomicU16, AtomicU32, AtomicUsize, Ordering}; @@ -41,6 +42,7 @@ use crate::devices::virtio::transport::pci::common_config::{ use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::devices::virtio::{TYPE_BLOCK, TYPE_NET}; use crate::logger::{debug, error}; +use crate::snapshot::Persist; use crate::utils::u64_to_usize; use crate::vstate::memory::GuestMemoryMmap; use crate::vstate::vm::{InterruptError, MsiVectorGroup}; @@ -283,8 +285,8 @@ const NOTIFY_OFF_MULTIPLIER: u32 = 4; // A dword per notification address. const VIRTIO_PCI_VENDOR_ID: u16 = 0x1af4; const VIRTIO_PCI_DEVICE_ID_BASE: u16 = 0x1040; // Add to device type to get device ID. -#[derive(Debug, Serialize, Deserialize)] -struct QueueState { +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct QueueState { max_size: u16, size: u16, ready: bool, @@ -293,14 +295,18 @@ struct QueueState { used_ring: u64, } -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct VirtioPciDeviceState { pub pci_device_bdf: PciBdf, - device_activated: bool, - queues: Vec, - interrupt_status: usize, - cap_pci_cfg_offset: usize, - cap_pci_cfg: Vec, + pub device_activated: bool, + pub interrupt_status: usize, + pub cap_pci_cfg_offset: usize, + pub cap_pci_cfg: Vec, + pub pci_configuration_state: PciConfigurationState, + pub pci_dev_state: VirtioPciCommonConfigState, + pub msix_state: MsixConfigState, + pub msi_vector_group: HashMap, + pub bar_configuration: Vec, } #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -337,7 +343,7 @@ pub struct VirtioPciDevice { // PCI interrupts. interrupt_status: Arc, virtio_interrupt: Option>, - interrupt_source_group: Arc, + interrupt_source_group: Arc, // Guest memory memory: GuestMemoryMmap, @@ -421,7 +427,6 @@ impl VirtioPciDevice { } /// Constructs a new PCI transport for the given virtio device. 
- #[allow(clippy::too_many_arguments)] pub fn new( id: String, memory: GuestMemoryMmap, @@ -464,7 +469,7 @@ impl VirtioPciDevice { device, device_activated: Arc::new(AtomicBool::new(false)), interrupt_status: Arc::new(AtomicUsize::new(0)), - virtio_interrupt: None, + virtio_interrupt: Some(interrupt), memory, settings_bar: 0, use_64bit_bar: true, @@ -476,6 +481,70 @@ impl VirtioPciDevice { Ok(virtio_pci_device) } + pub fn new_from_state( + id: String, + memory: GuestMemoryMmap, + device: Arc>, + msi_vectors: Arc, + state: VirtioPciDeviceState, + ) -> Result { + let msix_config = Self::msix_config( + state.pci_device_bdf.into(), + msi_vectors.clone(), + Some(state.msix_state), + )?; + + let pci_config = Self::pci_configuration( + device.lock().expect("Poisoned lock").device_type(), + &msix_config, + Some(state.pci_configuration_state), + ); + let virtio_common_config = VirtioPciCommonConfig::new(state.pci_dev_state); + let cap_pci_cfg_info = VirtioPciCfgCapInfo { + offset: state.cap_pci_cfg_offset, + cap: *VirtioPciCfgCap::from_slice(&state.cap_pci_cfg).unwrap(), + }; + + let interrupt = Arc::new(VirtioInterruptMsix::new( + msix_config.clone(), + virtio_common_config.msix_config.clone(), + virtio_common_config.msix_queues.clone(), + msi_vectors.clone(), + )); + + let virtio_pci_device = VirtioPciDevice { + id, + pci_device_bdf: state.pci_device_bdf, + configuration: pci_config, + common_config: virtio_common_config, + msix_config: Some(msix_config), + msix_num: msi_vectors.num_vectors(), + device, + device_activated: Arc::new(AtomicBool::new(state.device_activated)), + interrupt_status: Arc::new(AtomicUsize::new(state.interrupt_status)), + virtio_interrupt: Some(interrupt), + memory: memory.clone(), + settings_bar: 0, + use_64bit_bar: true, + interrupt_source_group: msi_vectors, + cap_pci_cfg_info, + bar_regions: state.bar_configuration, + }; + + if state.device_activated { + virtio_pci_device + .device + .lock() + .expect("Poisoned lock") + .activate( + memory, + virtio_pci_device.virtio_interrupt.as_ref().unwrap().clone(), + ); + } + + Ok(virtio_pci_device) + } + fn is_driver_ready(&self) -> bool { let ready_bits = (DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_DRIVER_OK | DEVICE_FEATURES_OK); @@ -657,6 +726,27 @@ impl VirtioPciDevice { } Ok(()) } + + pub fn state(&self) -> VirtioPciDeviceState { + VirtioPciDeviceState { + pci_device_bdf: self.pci_device_bdf, + device_activated: self.device_activated.load(Ordering::Acquire), + interrupt_status: self.interrupt_status.load(Ordering::Acquire), + cap_pci_cfg_offset: self.cap_pci_cfg_info.offset, + cap_pci_cfg: self.cap_pci_cfg_info.cap.bytes().to_vec(), + pci_configuration_state: self.configuration.state(), + pci_dev_state: self.common_config.state(), + msix_state: self + .msix_config + .as_ref() + .unwrap() + .lock() + .expect("Poisoned lock") + .state(), + msi_vector_group: self.interrupt_source_group.save(), + bar_configuration: self.bar_regions.clone(), + } + } } pub struct VirtioInterruptMsix { @@ -796,57 +886,33 @@ impl PciDevice for VirtioPciDevice { &mut self, mmio32_allocator: &mut AddressAllocator, mmio64_allocator: &mut AddressAllocator, - resources: Option>, + _resources: Option>, ) -> std::result::Result, PciDeviceError> { let mut bars = Vec::new(); let device_clone = self.device.clone(); let device = device_clone.lock().unwrap(); - let mut settings_bar_addr = None; - let mut use_64bit_bar = self.use_64bit_bar; - let restoring = resources.is_some(); - if let Some(resources) = resources { - for resource in resources { - if let 
Resource::PciBar { - index, base, type_, .. - } = resource - { - if index == VIRTIO_COMMON_BAR_INDEX { - settings_bar_addr = Some(GuestAddress(base)); - use_64bit_bar = match type_ { - PciBarType::Io => { - return Err(PciDeviceError::InvalidResource(resource)); - } - PciBarType::Mmio32 => false, - PciBarType::Mmio64 => true, - }; - break; - } - } - } - // Error out if no resource was matching the BAR id. - if settings_bar_addr.is_none() { - return Err(PciDeviceError::MissingResource); - } - } - // Allocate the virtio-pci capability BAR. // See http://docs.oasis-open.org/virtio/virtio/v1.0/cs04/virtio-v1.0-cs04.html#x1-740004 - let policy = match settings_bar_addr { - Some(addr) => AllocPolicy::ExactMatch(addr.0), - None => AllocPolicy::FirstMatch, - }; - let (virtio_pci_bar_addr, region_type) = if use_64bit_bar { + let (virtio_pci_bar_addr, region_type) = if self.use_64bit_bar { let region_type = PciBarRegionType::Memory64BitRegion; let addr = mmio64_allocator - .allocate(CAPABILITY_BAR_SIZE, CAPABILITY_BAR_SIZE, policy) + .allocate( + CAPABILITY_BAR_SIZE, + CAPABILITY_BAR_SIZE, + AllocPolicy::FirstMatch, + ) .unwrap() .start(); (addr, region_type) } else { let region_type = PciBarRegionType::Memory32BitRegion; let addr = mmio32_allocator - .allocate(CAPABILITY_BAR_SIZE, CAPABILITY_BAR_SIZE, policy) + .allocate( + CAPABILITY_BAR_SIZE, + CAPABILITY_BAR_SIZE, + AllocPolicy::FirstMatch, + ) .unwrap() .start(); (addr, region_type) @@ -862,14 +928,12 @@ impl PciDevice for VirtioPciDevice { // happen only during the creation of a brand new VM. When a VM is // restored from a known state, the BARs are already created with the // right content, therefore we don't need to go through this codepath. - if !restoring { - self.configuration - .add_pci_bar(&bar) - .map_err(|e| PciDeviceError::IoRegistrationFailed(virtio_pci_bar_addr, e))?; + self.configuration + .add_pci_bar(&bar) + .map_err(|e| PciDeviceError::IoRegistrationFailed(virtio_pci_bar_addr, e))?; - // Once the BARs are allocated, the capabilities can be added to the PCI configuration. - self.add_pci_capabilities(VIRTIO_COMMON_BAR_INDEX.try_into().unwrap())?; - } + // Once the BARs are allocated, the capabilities can be added to the PCI configuration. + self.add_pci_capabilities(VIRTIO_COMMON_BAR_INDEX.try_into().unwrap())?; bars.push(bar); @@ -1015,6 +1079,7 @@ impl PciDevice for VirtioPciDevice { Arc::clone(self.virtio_interrupt.as_ref().unwrap()), ) .unwrap_or_else(|err| error!("Error activating device: {err:?}")); + self.device_activated.store(true, Ordering::SeqCst); } else { debug!("Device doesn't need activation"); } From 31f09761d616f432fd2fa1dacaef3fb254df1034 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 13 Jun 2025 16:45:02 +0200 Subject: [PATCH 47/99] refactor(vm): move ResourceAllocator inside Vm ResourceAllocator object was part of DeviceManager since it is (mainly) devices that use it. ResourceAllocator is as well the object that implements (in a dummy way, for the moment) the DeviceRelocation trait which PciDevices use to move the address space of a PciDevice when triggered from the guest. Problem with DeviceRelocation is that it also needs the Vm file descriptor to perform the relocation, because we need to move register the new IO event fd for VirtIO devices. To make things simpler, move ResourceAllocator inside the Vm object. In subsequent commit we will remove the DeviceRelocation from ResourceAllocator and move it to Vm instead. 
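Concretely, call sites stop threading a separate allocator reference next to the Vm and reach the allocator through the Vm instead. Two representative call-site changes, drawn from the hunks below:

    // Before: the allocator was owned by DeviceManager and passed (or reached) separately.
    create_acpi_tables(vm.guest_memory(), device_manager, vcpus)?;
    device_manager.attach_boot_timer_device(request_ts)?;

    // After: the allocator lives inside Vm and is reached through it.
    create_acpi_tables(vm.guest_memory(), device_manager, &vm.common.resource_allocator, vcpus)?;
    device_manager.attach_boot_timer_device(&vm, request_ts)?;
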
This has the nice secondary effect that we were able to simplify the signature of many device-related methods that received Vm and ResourceAllocator arguments. Signed-off-by: Babis Chalios --- src/vmm/src/acpi/mod.rs | 50 ++++++++--------- src/vmm/src/arch/aarch64/fdt.rs | 9 +--- src/vmm/src/arch/x86_64/mod.rs | 11 ++-- src/vmm/src/arch/x86_64/mptable.rs | 2 +- src/vmm/src/arch/x86_64/vm.rs | 5 ++ src/vmm/src/builder.rs | 21 ++++---- src/vmm/src/device_manager/legacy.rs | 11 ++-- src/vmm/src/device_manager/mmio.rs | 48 ++++------------- src/vmm/src/device_manager/mod.rs | 54 +++++-------------- src/vmm/src/device_manager/pci_mngr.rs | 29 +++------- src/vmm/src/device_manager/persist.rs | 24 ++------- src/vmm/src/devices/acpi/vmgenid.rs | 2 +- src/vmm/src/devices/pci/pci_segment.rs | 2 +- .../devices/virtio/transport/pci/device.rs | 6 +-- src/vmm/src/lib.rs | 4 +- src/vmm/src/vstate/mod.rs | 2 + .../{device_manager => vstate}/resources.rs | 17 +++--- src/vmm/src/vstate/vm.rs | 20 +++---- 18 files changed, 113 insertions(+), 204 deletions(-) rename src/vmm/src/{device_manager => vstate}/resources.rs (96%) diff --git a/src/vmm/src/acpi/mod.rs b/src/vmm/src/acpi/mod.rs index a3e471aed9e..51711d9eb92 100644 --- a/src/vmm/src/acpi/mod.rs +++ b/src/vmm/src/acpi/mod.rs @@ -12,8 +12,8 @@ use crate::acpi::x86_64::{ }; use crate::arch::x86_64::layout; use crate::device_manager::DeviceManager; -use crate::device_manager::resources::ResourceAllocator; use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; +use crate::vstate::resources::ResourceAllocator; mod x86_64; @@ -80,7 +80,11 @@ impl AcpiTableWriter<'_> { } /// Build the DSDT table for the guest - fn build_dsdt(&mut self, device_manager: &mut DeviceManager) -> Result { + fn build_dsdt( + &mut self, + device_manager: &mut DeviceManager, + resource_allocator: &ResourceAllocator, + ) -> Result { let mut dsdt_data = Vec::new(); // Virtio-devices DSDT data @@ -99,7 +103,7 @@ impl AcpiTableWriter<'_> { setup_arch_dsdt(&mut dsdt_data)?; let mut dsdt = Dsdt::new(OEM_ID, *b"FCVMDSDT", OEM_REVISION, dsdt_data); - self.write_acpi_table(&device_manager.resource_allocator, &mut dsdt) + self.write_acpi_table(resource_allocator, &mut dsdt) } /// Build the FADT table for the guest @@ -193,26 +197,16 @@ impl AcpiTableWriter<'_> { pub(crate) fn create_acpi_tables( mem: &GuestMemoryMmap, device_manager: &mut DeviceManager, + resource_allocator: &ResourceAllocator, vcpus: &[Vcpu], ) -> Result<(), AcpiError> { let mut writer = AcpiTableWriter { mem }; - let dsdt_addr = writer.build_dsdt(device_manager)?; - - let fadt_addr = writer.build_fadt(&device_manager.resource_allocator, dsdt_addr)?; - let madt_addr = writer.build_madt( - &device_manager.resource_allocator, - vcpus.len().try_into().unwrap(), - )?; - let mcfg_addr = writer.build_mcfg( - &device_manager.resource_allocator, - layout::PCI_MMCONFIG_START, - )?; - let xsdt_addr = writer.build_xsdt( - &device_manager.resource_allocator, - fadt_addr, - madt_addr, - mcfg_addr, - )?; + let dsdt_addr = writer.build_dsdt(device_manager, resource_allocator)?; + + let fadt_addr = writer.build_fadt(resource_allocator, dsdt_addr)?; + let madt_addr = writer.build_madt(resource_allocator, vcpus.len().try_into().unwrap())?; + let mcfg_addr = writer.build_mcfg(resource_allocator, layout::PCI_MMCONFIG_START)?; + let xsdt_addr = writer.build_xsdt(resource_allocator, fadt_addr, madt_addr, mcfg_addr)?; writer.build_rsdp(xsdt_addr) } @@ -224,8 +218,8 @@ mod tests { use crate::acpi::{AcpiError, AcpiTableWriter}; use 
crate::arch::x86_64::layout::{SYSTEM_MEM_SIZE, SYSTEM_MEM_START}; use crate::builder::tests::default_vmm; - use crate::device_manager::resources::ResourceAllocator; use crate::utils::u64_to_usize; + use crate::vstate::resources::ResourceAllocator; use crate::vstate::vm::tests::setup_vm_with_memory; struct MockSdt(Vec); @@ -259,14 +253,14 @@ mod tests { // This should succeed let mut sdt = MockSdt(vec![0; 4096]); let addr = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START); // Let's try to write two 4K pages plus one byte let mut sdt = MockSdt(vec![0; usize::try_from(SYSTEM_MEM_SIZE + 1).unwrap()]); let err = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap_err(); assert!( matches!( @@ -281,27 +275,27 @@ mod tests { // succeed. let mut sdt = MockSdt(vec![0; 5]); let addr = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4096); let mut sdt = MockSdt(vec![0; 2]); let addr = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4101); let mut sdt = MockSdt(vec![0; 4]); let addr = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4103); let mut sdt = MockSdt(vec![0; 8]); let addr = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4107); let mut sdt = MockSdt(vec![0; 16]); let addr = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4115); } diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index e22bda5583e..8ae5e764df4 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -536,14 +536,7 @@ mod tests { let dummy = Arc::new(Mutex::new(DummyDevice::new())); device_manager .mmio_devices - .register_virtio_test_device( - &vm, - mem.clone(), - &device_manager.resource_allocator, - dummy, - &mut cmdline, - "dummy", - ) + .register_virtio_test_device(&vm, mem.clone(), dummy, &mut cmdline, "dummy") .unwrap(); create_fdt( diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index 68b903d5ff6..5307dbdf710 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -217,7 +217,7 @@ pub fn configure_system_for_boot( // Note that this puts the mptable at the last 1k of Linux's 640k base RAM mptable::setup_mptable( vm.guest_memory(), - &device_manager.resource_allocator, + &vm.common.resource_allocator, vcpu_config.vcpu_count, ) .map_err(ConfigurationError::MpTableSetup)?; @@ -238,7 +238,12 @@ pub fn configure_system_for_boot( // Create ACPI tables and write them in guest memory // For the time being we only support ACPI in x86_64 - create_acpi_tables(vm.guest_memory(), device_manager, vcpus)?; + create_acpi_tables( + vm.guest_memory(), + device_manager, + &vm.common.resource_allocator, 
+ vcpus, + )?; Ok(()) } @@ -568,9 +573,9 @@ mod tests { use linux_loader::loader::bootparam::boot_e820_entry; use super::*; - use crate::device_manager::resources::ResourceAllocator; use crate::test_utils::{arch_mem, single_region_mem}; use crate::utils::mib_to_bytes; + use crate::vstate::resources::ResourceAllocator; #[test] fn regions_lt_4gb() { diff --git a/src/vmm/src/arch/x86_64/mptable.rs b/src/vmm/src/arch/x86_64/mptable.rs index c397290c23e..17b2900aeb2 100644 --- a/src/vmm/src/arch/x86_64/mptable.rs +++ b/src/vmm/src/arch/x86_64/mptable.rs @@ -15,10 +15,10 @@ use vm_allocator::AllocPolicy; use crate::arch::IRQ_MAX; use crate::arch::x86_64::generated::mpspec; -use crate::device_manager::resources::ResourceAllocator; use crate::vstate::memory::{ Address, ByteValued, Bytes, GuestAddress, GuestMemory, GuestMemoryMmap, }; +use crate::vstate::resources::ResourceAllocator; // These `mpspec` wrapper types are only data, reading them from data is a safe initialization. // SAFETY: POD diff --git a/src/vmm/src/arch/x86_64/vm.rs b/src/vmm/src/arch/x86_64/vm.rs index e84b4338e35..9d22bf9a757 100644 --- a/src/vmm/src/arch/x86_64/vm.rs +++ b/src/vmm/src/arch/x86_64/vm.rs @@ -11,8 +11,10 @@ use kvm_ioctls::Cap; use serde::{Deserialize, Serialize}; use crate::arch::x86_64::msr::MsrError; +use crate::snapshot::Persist; use crate::utils::u64_to_usize; use crate::vstate::memory::{GuestMemoryExtension, GuestMemoryState}; +use crate::vstate::resources::ResourceAllocatorState; use crate::vstate::vm::{VmCommon, VmError}; /// Error type for [`Vm::restore_state`] @@ -187,6 +189,7 @@ impl ArchVm { Ok(VmState { memory: self.common.guest_memory.describe(), + resource_allocator: self.common.resource_allocator.save(), pitstate, clock, pic_master, @@ -211,6 +214,8 @@ impl ArchVm { pub struct VmState { /// guest memory state pub memory: GuestMemoryState, + /// resource allocator + pub resource_allocator: ResourceAllocatorState, pitstate: kvm_pit_state2, clock: kvm_clock_data, // TODO: rename this field to adopt inclusive language once Linux updates it, too. diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index a3c100ba83a..82299e41150 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -47,6 +47,8 @@ use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::MachineConfigError; use crate::vstate::kvm::{Kvm, KvmError}; use crate::vstate::memory::GuestRegionMmap; +#[cfg(target_arch = "aarch64")] +use crate::vstate::resources::ResourceAllocator; use crate::vstate::vcpu::VcpuError; use crate::vstate::vm::{Vm, VmError}; use crate::{EventManager, Vmm, VmmError}; @@ -188,7 +190,7 @@ pub fn build_microvm_for_boot( .collect::, _>>()?; if vm_resources.pci_enabled { - device_manager.enable_pci()?; + device_manager.enable_pci(&vm)?; } else { boot_cmdline.insert("pci", "off")?; } @@ -197,7 +199,7 @@ pub fn build_microvm_for_boot( // to maintain the same MMIO address referenced in the documentation // and tests. 
if vm_resources.boot_timer { - device_manager.attach_boot_timer_device(request_ts)?; + device_manager.attach_boot_timer_device(&vm, request_ts)?; } if let Some(balloon) = vm_resources.balloon.get() { @@ -252,7 +254,7 @@ pub fn build_microvm_for_boot( #[cfg(target_arch = "aarch64")] if vcpus[0].kvm_vcpu.supports_pvtime() { - setup_pvtime(&mut device_manager, &mut vcpus)?; + setup_pvtime(&vm.common.resource_allocator, &mut vcpus)?; } else { log::warn!("Vcpus do not support pvtime, steal time will not be reported to guest"); } @@ -513,13 +515,12 @@ const STEALTIME_STRUCT_MEM_SIZE: u64 = 64; /// Helper method to allocate steal time region #[cfg(target_arch = "aarch64")] fn allocate_pvtime_region( - device_manager: &mut DeviceManager, + resource_allocator: &ResourceAllocator, vcpu_count: usize, policy: vm_allocator::AllocPolicy, ) -> Result { let size = STEALTIME_STRUCT_MEM_SIZE * vcpu_count as u64; - let addr = device_manager - .resource_allocator + let addr = resource_allocator .allocate_system_memory(size, STEALTIME_STRUCT_MEM_SIZE, policy) .map_err(StartMicrovmError::AllocateResources)?; Ok(GuestAddress(addr)) @@ -528,12 +529,12 @@ fn allocate_pvtime_region( /// Sets up pvtime for all vcpus #[cfg(target_arch = "aarch64")] fn setup_pvtime( - device_manager: &mut DeviceManager, + resource_allocator: &ResourceAllocator, vcpus: &mut [Vcpu], ) -> Result<(), StartMicrovmError> { // Alloc sys mem for steal time region let pvtime_mem: GuestAddress = allocate_pvtime_region( - device_manager, + resource_allocator, vcpus.len(), vm_allocator::AllocPolicy::LastMatch, )?; @@ -1141,7 +1142,9 @@ pub(crate) mod tests { let mut vmm = default_vmm(); let request_ts = TimestampUs::default(); - let res = vmm.device_manager.attach_boot_timer_device(request_ts); + let res = vmm + .device_manager + .attach_boot_timer_device(&vmm.vm, request_ts); res.unwrap(); assert!(vmm.device_manager.mmio_devices.boot_timer.is_some()); } diff --git a/src/vmm/src/device_manager/legacy.rs b/src/vmm/src/device_manager/legacy.rs index 7011ae71122..47b259ef87b 100644 --- a/src/vmm/src/device_manager/legacy.rs +++ b/src/vmm/src/device_manager/legacy.rs @@ -97,11 +97,7 @@ impl PortIODeviceManager { } /// Register supported legacy devices. 
- pub fn register_devices( - &mut self, - io_bus: &vm_device::Bus, - vm: &Vm, - ) -> Result<(), LegacyDeviceError> { + pub fn register_devices(&mut self, vm: &Vm) -> Result<(), LegacyDeviceError> { let serial_2_4 = Arc::new(Mutex::new(SerialDevice { serial: Serial::with_events( self.com_evt_2_4.try_clone()?.try_clone()?, @@ -122,6 +118,8 @@ impl PortIODeviceManager { ), input: None, })); + + let io_bus = &vm.common.resource_allocator.pio_bus; io_bus.insert( self.stdio_serial.clone(), Self::SERIAL_PORT_ADDRESSES[0], @@ -243,7 +241,6 @@ mod tests { #[test] fn test_register_legacy_devices() { let (_, vm) = setup_vm_with_memory(0x1000); - let io_bus = vm_device::Bus::new(); vm.setup_irqchip().unwrap(); let mut ldm = PortIODeviceManager::new( Arc::new(Mutex::new(SerialDevice { @@ -261,6 +258,6 @@ mod tests { )), ) .unwrap(); - ldm.register_devices(&io_bus, &vm).unwrap(); + ldm.register_devices(&vm).unwrap(); } } diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 2a4b0161dab..209285ae5d5 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -18,7 +18,6 @@ use log::debug; use serde::{Deserialize, Serialize}; use vm_allocator::AllocPolicy; -use super::resources::ResourceAllocator; use crate::Vm; use crate::arch::BOOT_DEVICE_MEM_START; #[cfg(target_arch = "aarch64")] @@ -30,6 +29,7 @@ use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::transport::mmio::MmioTransport; #[cfg(target_arch = "x86_64")] use crate::vstate::memory::GuestAddress; +use crate::vstate::resources::ResourceAllocator; /// Errors for MMIO device manager. #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -179,7 +179,6 @@ impl MMIODeviceManager { &mut self, vm: &Vm, device_id: String, - mmio_bus: &vm_device::Bus, device: MMIODevice, ) -> Result<(), MmioError> { // Our virtio devices are currently hardcoded to use a single IRQ. 
@@ -202,7 +201,7 @@ impl MMIODeviceManager { .map_err(MmioError::RegisterIrqFd)?; } - mmio_bus.insert( + vm.common.resource_allocator.mmio_bus.insert( device.inner.clone(), device.resources.addr, device.resources.len, @@ -238,13 +237,12 @@ impl MMIODeviceManager { pub fn register_mmio_virtio_for_boot( &mut self, vm: &Vm, - resource_allocator: &ResourceAllocator, device_id: String, mmio_device: MmioTransport, _cmdline: &mut kernel_cmdline::Cmdline, ) -> Result<(), MmioError> { let device = MMIODevice { - resources: self.allocate_mmio_resources(resource_allocator, 1)?, + resources: self.allocate_mmio_resources(&vm.common.resource_allocator, 1)?, inner: Arc::new(Mutex::new(mmio_device)), }; @@ -260,7 +258,7 @@ impl MMIODeviceManager { device.resources.irq.unwrap(), )?; } - self.register_mmio_virtio(vm, device_id, &resource_allocator.mmio_bus, device)?; + self.register_mmio_virtio(vm, device_id, device)?; Ok(()) } @@ -270,7 +268,6 @@ impl MMIODeviceManager { pub fn register_mmio_serial( &mut self, vm: &Vm, - resource_allocator: &ResourceAllocator, serial: Arc>, device_info_opt: Option, ) -> Result<(), MmioError> { @@ -279,7 +276,7 @@ impl MMIODeviceManager { let device_info = if let Some(device_info) = device_info_opt { device_info } else { - let gsi = resource_allocator.allocate_gsi(1)?; + let gsi = vm.common.resource_allocator.allocate_gsi(1)?; MMIODeviceInfo { addr: SERIAL_MEM_START, len: MMIO_LEN, @@ -298,7 +295,7 @@ impl MMIODeviceManager { inner: serial, }; - resource_allocator.mmio_bus.insert( + vm.common.resource_allocator.mmio_bus.insert( device.inner.clone(), device.resources.addr, device.resources.len, @@ -480,20 +477,13 @@ pub(crate) mod tests { &mut self, vm: &Vm, guest_mem: GuestMemoryMmap, - resource_allocator: &ResourceAllocator, device: Arc>, cmdline: &mut kernel_cmdline::Cmdline, dev_id: &str, ) -> Result { let interrupt = Arc::new(IrqTrigger::new()); let mmio_device = MmioTransport::new(guest_mem, interrupt, device.clone(), false); - self.register_mmio_virtio_for_boot( - vm, - resource_allocator, - dev_id.to_string(), - mmio_device, - cmdline, - )?; + self.register_mmio_virtio_for_boot(vm, dev_id.to_string(), mmio_device, cmdline)?; Ok(self .get_virtio_device(device.lock().unwrap().device_type(), dev_id) .unwrap() @@ -600,7 +590,6 @@ pub(crate) mod tests { let mut vm = Vm::new(&kvm).unwrap(); vm.register_memory_regions(guest_mem).unwrap(); let mut device_manager = MMIODeviceManager::new(); - let resource_allocator = ResourceAllocator::new().unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); let dummy = Arc::new(Mutex::new(DummyDevice::new())); @@ -613,7 +602,6 @@ pub(crate) mod tests { .register_virtio_test_device( &vm, vm.guest_memory().clone(), - &resource_allocator, dummy, &mut cmdline, "dummy", @@ -648,7 +636,6 @@ pub(crate) mod tests { let mut vm = Vm::new(&kvm).unwrap(); vm.register_memory_regions(guest_mem).unwrap(); let mut device_manager = MMIODeviceManager::new(); - let resource_allocator = ResourceAllocator::new().unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); #[cfg(target_arch = "x86_64")] @@ -661,7 +648,6 @@ pub(crate) mod tests { .register_virtio_test_device( &vm, vm.guest_memory().clone(), - &resource_allocator, Arc::new(Mutex::new(DummyDevice::new())), &mut cmdline, "dummy1", @@ -675,7 +661,6 @@ pub(crate) mod tests { .register_virtio_test_device( &vm, vm.guest_memory().clone(), - &resource_allocator, Arc::new(Mutex::new(DummyDevice::new())), &mut cmdline, "dummy2" @@ -710,21 +695,13 @@ pub(crate) mod tests { 
vm.setup_irqchip(1).unwrap(); let mut device_manager = MMIODeviceManager::new(); - let resource_allocator = ResourceAllocator::new().unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); let dummy = Arc::new(Mutex::new(DummyDevice::new())); let type_id = dummy.lock().unwrap().device_type(); let id = String::from("foo"); let addr = device_manager - .register_virtio_test_device( - &vm, - vm.guest_memory().clone(), - &resource_allocator, - dummy, - &mut cmdline, - &id, - ) + .register_virtio_test_device(&vm, vm.guest_memory().clone(), dummy, &mut cmdline, &id) .unwrap(); assert!(device_manager.get_virtio_device(type_id, &id).is_some()); assert_eq!( @@ -747,14 +724,7 @@ pub(crate) mod tests { let dummy2 = Arc::new(Mutex::new(DummyDevice::new())); let id2 = String::from("foo2"); device_manager - .register_virtio_test_device( - &vm, - vm.guest_memory().clone(), - &resource_allocator, - dummy2, - &mut cmdline, - &id2, - ) + .register_virtio_test_device(&vm, vm.guest_memory().clone(), dummy2, &mut cmdline, &id2) .unwrap(); let mut count = 0; diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 260f3337673..f037a4a8d05 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -18,7 +18,6 @@ use log::{error, info}; use mmio::{MMIODeviceManager, MmioError}; use pci_mngr::{PciDevices, PciDevicesConstructorArgs, PciManagerError}; use persist::{ACPIDeviceManagerConstructorArgs, MMIODevManagerConstructorArgs}; -use resources::ResourceAllocator; use serde::{Deserialize, Serialize}; use utils::time::TimestampUs; use vmm_sys_util::eventfd::EventFd; @@ -54,8 +53,6 @@ pub mod mmio; pub mod pci_mngr; /// Device managers (de)serialization support. pub mod persist; -/// Resource manager for devices. -pub mod resources; #[derive(Debug, thiserror::Error, displaydoc::Display)] /// Error while creating a new [`DeviceManager`] @@ -93,8 +90,6 @@ pub enum AttachDeviceError { #[derive(Debug)] /// A manager of all peripheral devices of Firecracker pub struct DeviceManager { - /// Allocator for system memory and interrupt numbers - pub resource_allocator: Arc, /// MMIO devices pub mmio_devices: MMIODeviceManager, #[cfg(target_arch = "x86_64")] @@ -139,7 +134,6 @@ impl DeviceManager { event_manager: &mut EventManager, vcpus_exit_evt: &EventFd, vm: &Vm, - resource_allocator: &ResourceAllocator, ) -> Result { Self::set_stdout_nonblocking(); @@ -153,7 +147,7 @@ impl DeviceManager { // create pio dev manager with legacy devices let mut legacy_devices = PortIODeviceManager::new(serial, i8042)?; - legacy_devices.register_devices(&resource_allocator.pio_bus, vm)?; + legacy_devices.register_devices(vm)?; Ok(legacy_devices) } @@ -163,13 +157,10 @@ impl DeviceManager { vcpus_exit_evt: &EventFd, vm: &Vm, ) -> Result { - let resource_allocator = Arc::new(ResourceAllocator::new()?); #[cfg(target_arch = "x86_64")] - let legacy_devices = - Self::create_legacy_devices(event_manager, vcpus_exit_evt, vm, &resource_allocator)?; + let legacy_devices = Self::create_legacy_devices(event_manager, vcpus_exit_evt, vm)?; Ok(DeviceManager { - resource_allocator, mmio_devices: MMIODeviceManager::new(), #[cfg(target_arch = "x86_64")] legacy_devices, @@ -193,13 +184,8 @@ impl DeviceManager { // The device mutex mustn't be locked here otherwise it will deadlock. 
let device = MmioTransport::new(vm.guest_memory().clone(), interrupt, device, is_vhost_user); - self.mmio_devices.register_mmio_virtio_for_boot( - vm, - &self.resource_allocator, - id, - device, - cmdline, - )?; + self.mmio_devices + .register_mmio_virtio_for_boot(vm, id, device, cmdline)?; Ok(()) } @@ -214,8 +200,7 @@ impl DeviceManager { is_vhost_user: bool, ) -> Result<(), AttachDeviceError> { if self.pci_devices.pci_segment.is_some() { - self.pci_devices - .attach_pci_virtio_device(vm, &self.resource_allocator, id, device)?; + self.pci_devices.attach_pci_virtio_device(vm, id, device)?; } else { self.attach_mmio_virtio_device(vm, id, device, cmdline, is_vhost_user)?; } @@ -226,12 +211,13 @@ impl DeviceManager { /// Attaches a [`BootTimer`] to the VM pub(crate) fn attach_boot_timer_device( &mut self, + vm: &Vm, request_ts: TimestampUs, ) -> Result<(), AttachDeviceError> { let boot_timer = Arc::new(Mutex::new(BootTimer::new(request_ts))); self.mmio_devices - .register_mmio_boot_timer(&self.resource_allocator.mmio_bus, boot_timer)?; + .register_mmio_boot_timer(&vm.common.resource_allocator.mmio_bus, boot_timer)?; Ok(()) } @@ -241,7 +227,7 @@ impl DeviceManager { mem: &GuestMemoryMmap, vm: &Vm, ) -> Result<(), AttachDeviceError> { - let vmgenid = VmGenId::new(mem, &self.resource_allocator)?; + let vmgenid = VmGenId::new(mem, &vm.common.resource_allocator)?; self.acpi_devices.attach_vmgenid(vmgenid, vm)?; Ok(()) } @@ -265,21 +251,19 @@ impl DeviceManager { // Make stdout non-blocking. Self::set_stdout_nonblocking(); let serial = Self::setup_serial_device(event_manager)?; - self.mmio_devices - .register_mmio_serial(vm, &self.resource_allocator, serial, None)?; + self.mmio_devices.register_mmio_serial(vm, serial, None)?; self.mmio_devices.add_mmio_serial_to_cmdline(cmdline)?; } let rtc = Arc::new(Mutex::new(RTCDevice::new())); self.mmio_devices - .register_mmio_rtc(&self.resource_allocator, rtc, None)?; + .register_mmio_rtc(&vm.common.resource_allocator, rtc, None)?; Ok(()) } /// Enables PCIe support for Firecracker devices - pub fn enable_pci(&mut self) -> Result<(), PciManagerError> { - self.pci_devices - .attach_pci_segment(&self.resource_allocator) + pub fn enable_pci(&mut self, vm: &Arc) -> Result<(), PciManagerError> { + self.pci_devices.attach_pci_segment(vm) } fn do_kick_device(virtio_device: Arc>) { @@ -397,8 +381,6 @@ impl DeviceManager { #[derive(Debug, Default, Clone, Serialize, Deserialize)] /// State of devices in the system pub struct DevicesState { - /// Resource allocator state - pub resource_allocator_state: resources::ResourceAllocatorState, /// MMIO devices state pub mmio_state: persist::DeviceStates, /// ACPI devices state @@ -454,7 +436,6 @@ impl<'a> Persist<'a> for DeviceManager { fn save(&self) -> Self::State { DevicesState { - resource_allocator_state: self.resource_allocator.save(), mmio_state: self.mmio_devices.save(), acpi_state: self.acpi_devices.save(), pci_state: self.pci_devices.save(), @@ -465,17 +446,12 @@ impl<'a> Persist<'a> for DeviceManager { constructor_args: Self::ConstructorArgs, state: &Self::State, ) -> std::result::Result { - // Safe to unwrap here. ResourceAllocator restoring cannot fail. 
- let resource_allocator = - Arc::new(ResourceAllocator::restore((), &state.resource_allocator_state).unwrap()); - // Setup legacy devices in case of x86 #[cfg(target_arch = "x86_64")] let legacy_devices = Self::create_legacy_devices( constructor_args.event_manager, constructor_args.vcpus_exit_evt, constructor_args.vm, - &resource_allocator, )?; // Restore MMIO devices @@ -483,7 +459,6 @@ impl<'a> Persist<'a> for DeviceManager { mem: constructor_args.mem, vm: constructor_args.vm, event_manager: constructor_args.event_manager, - resource_allocator: &resource_allocator, vm_resources: constructor_args.vm_resources, instance_id: constructor_args.instance_id, restored_from_file: constructor_args.restored_from_file, @@ -493,7 +468,6 @@ impl<'a> Persist<'a> for DeviceManager { // Restore ACPI devices let acpi_ctor_args = ACPIDeviceManagerConstructorArgs { mem: constructor_args.mem, - resource_allocator: &resource_allocator, vm: constructor_args.vm, }; let mut acpi_devices = ACPIDeviceManager::restore(acpi_ctor_args, &state.acpi_state)?; @@ -501,7 +475,6 @@ impl<'a> Persist<'a> for DeviceManager { // Restore PCI devices let pci_ctor_args = PciDevicesConstructorArgs { - resource_allocator: &resource_allocator, vm: constructor_args.vm.clone(), mem: constructor_args.mem, vm_resources: constructor_args.vm_resources, @@ -512,7 +485,6 @@ impl<'a> Persist<'a> for DeviceManager { let pci_devices = PciDevices::restore(pci_ctor_args, &state.pci_state)?; let device_manager = DeviceManager { - resource_allocator, mmio_devices, #[cfg(target_arch = "x86_64")] legacy_devices, @@ -578,7 +550,6 @@ pub(crate) mod tests { let mmio_devices = MMIODeviceManager::new(); let acpi_devices = ACPIDeviceManager::new(); let pci_devices = PciDevices::new(); - let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); #[cfg(target_arch = "x86_64")] let legacy_devices = PortIODeviceManager::new( @@ -592,7 +563,6 @@ pub(crate) mod tests { .unwrap(); DeviceManager { - resource_allocator, mmio_devices, #[cfg(target_arch = "x86_64")] legacy_devices, diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index d2c8ae27528..e5dde833db2 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -12,7 +12,6 @@ use serde::{Deserialize, Serialize}; use vm_device::BusError; use super::persist::{MmdsState, SharedDeviceType}; -use crate::device_manager::resources::ResourceAllocator; use crate::devices::pci::PciSegment; use crate::devices::virtio::balloon::Balloon; use crate::devices::virtio::balloon::persist::{BalloonConstructorArgs, BalloonState}; @@ -35,6 +34,7 @@ use crate::resources::VmResources; use crate::snapshot::Persist; use crate::vmm_config::mmds::MmdsConfigError; use crate::vstate::memory::GuestMemoryMmap; +use crate::vstate::resources::ResourceAllocator; use crate::vstate::vm::{InterruptError, MsiVectorGroup}; use crate::{EventManager, Vm}; @@ -71,17 +71,14 @@ impl PciDevices { Default::default() } - pub fn attach_pci_segment( - &mut self, - resource_allocator: &Arc, - ) -> Result<(), PciManagerError> { + pub fn attach_pci_segment(&mut self, vm: &Arc) -> Result<(), PciManagerError> { // We only support a single PCIe segment. Calling this function twice is a Firecracker // internal error. assert!(self.pci_segment.is_none()); // Currently we don't assign any IRQs to PCI devices. We will be using MSI-X interrupts // only. 
- let pci_segment = PciSegment::new(0, resource_allocator, &[0u8; 32])?; + let pci_segment = PciSegment::new(0, &vm.common.resource_allocator, &[0u8; 32])?; self.pci_segment = Some(pci_segment); Ok(()) @@ -131,7 +128,6 @@ impl PciDevices { >( &mut self, vm: &Arc, - resource_allocator: &ResourceAllocator, id: String, device: Arc>, ) -> Result<(), PciManagerError> { @@ -140,17 +136,14 @@ impl PciDevices { let pci_device_bdf = pci_segment.next_device_bdf()?; debug!("Allocating BDF: {pci_device_bdf:?} for device"); let mem = vm.guest_memory().clone(); + let resource_allocator = &vm.common.resource_allocator; let device_type: u32 = device.lock().expect("Poisoned lock").device_type(); // Allocate one MSI vector per queue, plus one for configuration let msix_num = u16::try_from(device.lock().expect("Poisoned lock").queues().len() + 1).unwrap(); - let msix_vectors = Arc::new(Vm::create_msix_group( - vm.clone(), - resource_allocator, - msix_num, - )?); + let msix_vectors = Arc::new(Vm::create_msix_group(vm.clone(), msix_num)?); // Create the transport let mut virtio_device = @@ -190,7 +183,6 @@ impl PciDevices { fn restore_pci_device( &mut self, vm: &Arc, - resource_allocator: &ResourceAllocator, device: Arc>, device_id: &str, transport_state: &VirtioPciDeviceState, @@ -224,7 +216,7 @@ impl PciDevices { self.virtio_devices .insert((device_type, device_id.to_string()), virtio_device.clone()); - Self::register_bars_with_bus(resource_allocator, &virtio_device)?; + Self::register_bars_with_bus(&vm.common.resource_allocator, &virtio_device)?; virtio_device .lock() .expect("Poisoned lock") @@ -279,7 +271,6 @@ pub struct PciDevicesState { pub struct PciDevicesConstructorArgs<'a> { pub vm: Arc, pub mem: &'a GuestMemoryMmap, - pub resource_allocator: &'a Arc, pub vm_resources: &'a mut VmResources, pub instance_id: &'a str, pub restored_from_file: bool, @@ -291,7 +282,6 @@ impl<'a> Debug for PciDevicesConstructorArgs<'a> { f.debug_struct("PciDevicesConstructorArgs") .field("vm", &self.vm) .field("mem", &self.mem) - .field("resource_allocator", &self.resource_allocator) .field("vm_resources", &self.vm_resources) .field("instance_id", &self.instance_id) .field("restored_from_file", &self.restored_from_file) @@ -441,7 +431,7 @@ impl<'a> Persist<'a> for PciDevices { return Ok(pci_devices); } - pci_devices.attach_pci_segment(constructor_args.resource_allocator)?; + pci_devices.attach_pci_segment(&constructor_args.vm)?; if let Some(balloon_state) = &state.balloon_device { let device = Arc::new(Mutex::new( @@ -463,7 +453,6 @@ impl<'a> Persist<'a> for PciDevices { pci_devices .restore_pci_device( &constructor_args.vm, - constructor_args.resource_allocator, device, &balloon_state.device_id, &balloon_state.transport_state, @@ -489,7 +478,6 @@ impl<'a> Persist<'a> for PciDevices { pci_devices .restore_pci_device( &constructor_args.vm, - constructor_args.resource_allocator, device, &block_state.device_id, &block_state.transport_state, @@ -540,7 +528,6 @@ impl<'a> Persist<'a> for PciDevices { pci_devices .restore_pci_device( &constructor_args.vm, - constructor_args.resource_allocator, device, &net_state.device_id, &net_state.transport_state, @@ -574,7 +561,6 @@ impl<'a> Persist<'a> for PciDevices { pci_devices .restore_pci_device( &constructor_args.vm, - constructor_args.resource_allocator, device, &vsock_state.device_id, &vsock_state.transport_state, @@ -598,7 +584,6 @@ impl<'a> Persist<'a> for PciDevices { pci_devices .restore_pci_device( &constructor_args.vm, - constructor_args.resource_allocator, device, 
&entropy_state.device_id, &entropy_state.transport_state, diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index db38bb4f4ca..1273de48ba8 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -12,7 +12,6 @@ use serde::{Deserialize, Serialize}; use super::acpi::ACPIDeviceManager; use super::mmio::*; -use super::resources::ResourceAllocator; #[cfg(target_arch = "aarch64")] use crate::arch::DeviceType; use crate::devices::acpi::vmgenid::{VMGenIDState, VMGenIdConstructorArgs, VmGenId, VmGenIdError}; @@ -146,7 +145,6 @@ pub struct MMIODevManagerConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, pub vm: &'a Vm, pub event_manager: &'a mut EventManager, - pub resource_allocator: &'a ResourceAllocator, pub vm_resources: &'a mut VmResources, pub instance_id: &'a str, pub restored_from_file: bool, @@ -172,7 +170,6 @@ pub struct ACPIDeviceManagerState { #[derive(Debug)] pub struct ACPIDeviceManagerConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, - pub resource_allocator: &'a ResourceAllocator, pub vm: &'a Vm, } @@ -204,7 +201,7 @@ impl<'a> Persist<'a> for ACPIDeviceManager { let vmgenid = VmGenId::restore( VMGenIdConstructorArgs { mem: constructor_args.mem, - resource_allocator: constructor_args.resource_allocator, + resource_allocator: &constructor_args.vm.common.resource_allocator, }, vmgenid_args, )?; @@ -369,17 +366,12 @@ impl<'a> Persist<'a> for MMIODeviceManager { .event_manager .add_subscriber(serial.clone()); - dev_manager.register_mmio_serial( - vm, - constructor_args.resource_allocator, - serial, - Some(state.device_info), - )?; + dev_manager.register_mmio_serial(vm, serial, Some(state.device_info))?; } if state.type_ == DeviceType::Rtc { let rtc = Arc::new(Mutex::new(RTCDevice::new())); dev_manager.register_mmio_rtc( - constructor_args.resource_allocator, + &constructor_args.vm.common.resource_allocator, rtc, Some(state.device_info), )?; @@ -394,7 +386,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { id: &String, state: &MmioTransportState, device_info: &MMIODeviceInfo, - mmio_bus: &vm_device::Bus, event_manager: &mut EventManager| -> Result<(), Self::Error> { let interrupt = Arc::new(IrqTrigger::new()); @@ -412,7 +403,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { dev_manager.register_mmio_virtio( vm, id.clone(), - mmio_bus, MMIODevice { resources: *device_info, inner: mmio_transport, @@ -451,7 +441,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { &balloon_state.device_id, &balloon_state.transport_state, &balloon_state.device_info, - &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -474,7 +463,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { &block_state.device_id, &block_state.transport_state, &block_state.device_info, - &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -514,7 +502,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { &net_state.device_id, &net_state.transport_state, &net_state.device_info, - &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -544,7 +531,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { &vsock_state.device_id, &vsock_state.transport_state, &vsock_state.device_info, - &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -569,7 +555,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { &entropy_state.device_id, &entropy_state.transport_state, &entropy_state.device_info, - &constructor_args.resource_allocator.mmio_bus, 
constructor_args.event_manager, )?; } @@ -700,14 +685,11 @@ mod tests { let vmm = default_vmm(); let device_manager_state: device_manager::DevicesState = Snapshot::deserialize(&mut buf.as_slice()).unwrap(); - let resource_allocator = - ResourceAllocator::restore((), &device_manager_state.resource_allocator_state).unwrap(); let vm_resources = &mut VmResources::default(); let restore_args = MMIODevManagerConstructorArgs { mem: vmm.vm.guest_memory(), vm: &vmm.vm, event_manager: &mut event_manager, - resource_allocator: &resource_allocator, vm_resources, instance_id: "microvm-id", restored_from_file: true, diff --git a/src/vmm/src/devices/acpi/vmgenid.rs b/src/vmm/src/devices/acpi/vmgenid.rs index 0cf0ae0d7b1..5c8d4ecbc51 100644 --- a/src/vmm/src/devices/acpi/vmgenid.rs +++ b/src/vmm/src/devices/acpi/vmgenid.rs @@ -11,9 +11,9 @@ use vm_superio::Trigger; use vmm_sys_util::eventfd::EventFd; use super::super::legacy::EventFdTrigger; -use crate::device_manager::resources::ResourceAllocator; use crate::snapshot::Persist; use crate::vstate::memory::{Bytes, GuestMemoryMmap}; +use crate::vstate::resources::ResourceAllocator; /// Bytes of memory we allocate for VMGenID device pub const VMGENID_MEM_SIZE: u64 = 16; diff --git a/src/vmm/src/devices/pci/pci_segment.rs b/src/vmm/src/devices/pci/pci_segment.rs index c1e8bb07cb8..e957332bb0e 100644 --- a/src/vmm/src/devices/pci/pci_segment.rs +++ b/src/vmm/src/devices/pci/pci_segment.rs @@ -22,7 +22,7 @@ use vm_allocator::AddressAllocator; use vm_device::{BusDeviceSync, BusError}; use crate::arch::{PCI_MMCONFIG_START, PCI_MMIO_CONFIG_SIZE_PER_SEGMENT}; -use crate::device_manager::resources::ResourceAllocator; +use crate::vstate::resources::ResourceAllocator; pub struct PciSegment { pub(crate) id: u16, diff --git a/src/vmm/src/devices/virtio/transport/pci/device.rs b/src/vmm/src/devices/virtio/transport/pci/device.rs index 6793d502f00..384ad0358dd 100644 --- a/src/vmm/src/devices/virtio/transport/pci/device.rs +++ b/src/vmm/src/devices/virtio/transport/pci/device.rs @@ -33,7 +33,6 @@ use vmm_sys_util::errno; use vmm_sys_util::eventfd::EventFd; use crate::Vm; -use crate::device_manager::resources::ResourceAllocator; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::Queue; use crate::devices::virtio::transport::pci::common_config::{ @@ -45,6 +44,7 @@ use crate::logger::{debug, error}; use crate::snapshot::Persist; use crate::utils::u64_to_usize; use crate::vstate::memory::GuestMemoryMmap; +use crate::vstate::resources::ResourceAllocator; use crate::vstate::vm::{InterruptError, MsiVectorGroup}; const DEVICE_INIT: u8 = 0x00; @@ -1153,7 +1153,7 @@ mod tests { #[test] fn test_pci_device_config() { let mut vmm = default_vmm(); - vmm.device_manager.enable_pci(); + vmm.device_manager.enable_pci(&vmm.vm); let entropy = Arc::new(Mutex::new(Entropy::new(RateLimiter::default()).unwrap())); vmm.device_manager .attach_virtio_device( @@ -1271,7 +1271,7 @@ mod tests { #[test] fn test_reading_bars() { let mut vmm = default_vmm(); - vmm.device_manager.enable_pci(); + vmm.device_manager.enable_pci(&vmm.vm); let entropy = Arc::new(Mutex::new(Entropy::new(RateLimiter::default()).unwrap())); vmm.device_manager .attach_virtio_device( diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 77d0e6c85b8..2b558d566d4 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -371,10 +371,10 @@ impl Vmm { self.vcpus_handles.reserve(vcpu_count); for mut vcpu in vcpus.drain(..) 
{ - vcpu.set_mmio_bus(self.device_manager.resource_allocator.mmio_bus.clone()); + vcpu.set_mmio_bus(self.vm.common.resource_allocator.mmio_bus.clone()); #[cfg(target_arch = "x86_64")] vcpu.kvm_vcpu - .set_pio_bus(self.device_manager.resource_allocator.pio_bus.clone()); + .set_pio_bus(self.vm.common.resource_allocator.pio_bus.clone()); self.vcpus_handles .push(vcpu.start_threaded(vcpu_seccomp_filter.clone(), barrier.clone())?); diff --git a/src/vmm/src/vstate/mod.rs b/src/vmm/src/vstate/mod.rs index 47458835e04..f4fa25914d0 100644 --- a/src/vmm/src/vstate/mod.rs +++ b/src/vmm/src/vstate/mod.rs @@ -5,6 +5,8 @@ pub mod kvm; /// Module with GuestMemory implementation. pub mod memory; +/// Resource manager for devices. +pub mod resources; /// Module with Vcpu implementation. pub mod vcpu; /// Module with Vm implementation. diff --git a/src/vmm/src/device_manager/resources.rs b/src/vmm/src/vstate/resources.rs similarity index 96% rename from src/vmm/src/device_manager/resources.rs rename to src/vmm/src/vstate/resources.rs index 2481d13b37e..91522de4cb1 100644 --- a/src/vmm/src/device_manager/resources.rs +++ b/src/vmm/src/vstate/resources.rs @@ -22,13 +22,13 @@ use crate::snapshot::Persist; /// * Memory allocations in the MMIO address space #[derive(Debug)] pub struct ResourceAllocator { - // Allocator for device interrupt lines + /// Allocator for device interrupt lines pub gsi_allocator: Arc>, - // Allocator for memory in the 32-bit MMIO address space + /// Allocator for memory in the 32-bit MMIO address space pub mmio32_memory: Arc>, - // Allocator for memory in the 64-bit MMIO address space + /// Allocator for memory in the 64-bit MMIO address space pub mmio64_memory: Arc>, - // Memory allocator for system data + /// Memory allocator for system data pub system_memory: Arc>, /// MMIO bus pub mmio_bus: Arc, @@ -186,14 +186,15 @@ impl<'a> Persist<'a> for ResourceAllocator { } #[derive(Debug, Clone, Serialize, Deserialize)] +/// State of a ResourceAllocator pub struct ResourceAllocatorState { - // Allocator for device interrupt lines + /// Allocator for device interrupt lines pub gsi_allocator: Arc>, - // Allocator for memory in the 32-bit MMIO address space + /// Allocator for memory in the 32-bit MMIO address space pub mmio32_memory: Arc>, - // Allocator for memory in the 64-bit MMIO address space + /// Allocator for memory in the 64-bit MMIO address space pub mmio64_memory: Arc>, - // Memory allocator for system data + /// Memory allocator for system data pub system_memory: Arc>, } diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 6d5a86f76ed..c19c7c26bd4 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -29,7 +29,6 @@ use vmm_sys_util::eventfd::EventFd; use crate::arch::host_page_size; pub use crate::arch::{ArchVm as Vm, ArchVmError, VmState}; -use crate::device_manager::resources::ResourceAllocator; use crate::logger::info; use crate::persist::CreateSnapshotError; use crate::snapshot::Persist; @@ -38,6 +37,7 @@ use crate::vmm_config::snapshot::SnapshotType; use crate::vstate::memory::{ Address, GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion, GuestRegionMmap, }; +use crate::vstate::resources::ResourceAllocator; use crate::vstate::vcpu::VcpuError; use crate::{DirtyBitmap, Vcpu, mem_size_mib}; @@ -245,6 +245,8 @@ pub struct VmCommon { pub guest_memory: GuestMemoryMmap, /// Interrupts used by Vm's devices pub interrupts: Mutex>, + /// Allocator for VM resources + pub resource_allocator: Arc, } /// Errors associated with the 
wrappers over KVM ioctls. @@ -270,6 +272,8 @@ pub enum VmError { VmMemory(#[from] vm_memory::Error), /// Error calling mincore: {0} Mincore(vmm_sys_util::errno::Error), + /// ResourceAllocator error: {0} + ResourceAllocator(#[from] vm_allocator::Error) } /// Contains Vm functions that are usable across CPU architectures @@ -317,6 +321,7 @@ impl Vm { max_memslots: kvm.max_nr_memslots(), guest_memory: GuestMemoryMmap::default(), interrupts: Mutex::new(HashMap::new()), + resource_allocator: Arc::new(ResourceAllocator::new()?), }) } @@ -574,14 +579,12 @@ impl Vm { } /// Create a group of MSI-X interrupts - pub fn create_msix_group( - vm: Arc, - resource_allocator: &ResourceAllocator, - count: u16, - ) -> Result { + pub fn create_msix_group(vm: Arc, count: u16) -> Result { debug!("Creating new MSI group with {count} vectors"); let mut irq_routes = HashMap::with_capacity(count as usize); - for (gsi, i) in resource_allocator + for (gsi, i) in vm + .common + .resource_allocator .allocate_gsi(count as u32)? .iter() .zip(0u32..) @@ -772,8 +775,7 @@ pub(crate) mod tests { } fn create_msix_group(vm: &Arc) -> MsiVectorGroup { - let resource_allocator = ResourceAllocator::new().unwrap(); - Vm::create_msix_group(vm.clone(), &resource_allocator, 4).unwrap() + Vm::create_msix_group(vm.clone(), 4).unwrap() } #[test] From d3f6e4c52581db85bd6db17c92ef2d00d3259b2e Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 13 Jun 2025 17:09:46 +0200 Subject: [PATCH 48/99] refactor(vm): move `Bus` objects to Vm We had previously added MMIO and Port IO buses inside ResourceAllocator so that we could implement DeviceRelocation for the type. Now, we will delegate device relocation responsibilities to ArchVm instead. That is because device relocation requires access to the Vm file descriptor as well. As a result, we can move buses to the Vm object itself. Add MMIO bus to VmCommon as both architectures use it. Add PortIO bus for x86 architecture only. Note that we still don't support DeviceRelocation. VirtIO devices should not request us to relocate them. Also, adding such support would require us to support VirtIO reset as well. We will look into adding this functionality later on. Signed-off-by: Babis Chalios --- src/vmm/src/arch/x86_64/vm.rs | 6 +++ src/vmm/src/device_manager/legacy.rs | 2 +- src/vmm/src/device_manager/mmio.rs | 10 ++--- src/vmm/src/device_manager/mod.rs | 5 +-- src/vmm/src/device_manager/pci_mngr.rs | 24 ++++------ src/vmm/src/device_manager/persist.rs | 2 +- src/vmm/src/devices/pci/pci_segment.rs | 61 ++++++++++++-------------- src/vmm/src/lib.rs | 5 +-- src/vmm/src/vstate/resources.rs | 26 ----------- src/vmm/src/vstate/vm.rs | 20 ++++++++- 10 files changed, 73 insertions(+), 88 deletions(-) diff --git a/src/vmm/src/arch/x86_64/vm.rs b/src/vmm/src/arch/x86_64/vm.rs index 9d22bf9a757..fbc27c82a60 100644 --- a/src/vmm/src/arch/x86_64/vm.rs +++ b/src/vmm/src/arch/x86_64/vm.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use std::fmt; +use std::sync::Arc; use kvm_bindings::{ KVM_CLOCK_TSC_STABLE, KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, @@ -58,6 +59,8 @@ pub struct ArchVm { /// /// `None` if `KVM_CAP_XSAVE2` not supported.
xsave2_size: Option, + /// Port IO bus + pub pio_bus: Arc, } impl ArchVm { @@ -92,10 +95,13 @@ impl ArchVm { .set_tss_address(u64_to_usize(crate::arch::x86_64::layout::KVM_TSS_ADDRESS)) .map_err(ArchVmError::SetTssAddress)?; + let pio_bus = Arc::new(vm_device::Bus::new()); + Ok(ArchVm { common, msrs_to_save, xsave2_size, + pio_bus, }) } diff --git a/src/vmm/src/device_manager/legacy.rs b/src/vmm/src/device_manager/legacy.rs index 47b259ef87b..d0194e24e62 100644 --- a/src/vmm/src/device_manager/legacy.rs +++ b/src/vmm/src/device_manager/legacy.rs @@ -119,7 +119,7 @@ impl PortIODeviceManager { input: None, })); - let io_bus = &vm.common.resource_allocator.pio_bus; + let io_bus = &vm.pio_bus; io_bus.insert( self.stdio_serial.clone(), Self::SERIAL_PORT_ADDRESSES[0], diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 209285ae5d5..6b3ea95a5b5 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -201,7 +201,7 @@ impl MMIODeviceManager { .map_err(MmioError::RegisterIrqFd)?; } - vm.common.resource_allocator.mmio_bus.insert( + vm.common.mmio_bus.insert( device.inner.clone(), device.resources.addr, device.resources.len, @@ -295,7 +295,7 @@ impl MMIODeviceManager { inner: serial, }; - vm.common.resource_allocator.mmio_bus.insert( + vm.common.mmio_bus.insert( device.inner.clone(), device.resources.addr, device.resources.len, @@ -326,7 +326,7 @@ impl MMIODeviceManager { /// given as parameter, otherwise allocate a new MMIO resources for it. pub fn register_mmio_rtc( &mut self, - resource_allocator: &ResourceAllocator, + vm: &Vm, rtc: Arc>, device_info_opt: Option, ) -> Result<(), MmioError> { @@ -335,7 +335,7 @@ impl MMIODeviceManager { let device_info = if let Some(device_info) = device_info_opt { device_info } else { - let gsi = resource_allocator.allocate_gsi(1)?; + let gsi = vm.common.resource_allocator.allocate_gsi(1)?; MMIODeviceInfo { addr: RTC_MEM_START, len: MMIO_LEN, @@ -348,7 +348,7 @@ impl MMIODeviceManager { inner: rtc, }; - resource_allocator.mmio_bus.insert( + vm.common.mmio_bus.insert( device.inner.clone(), device.resources.addr, device.resources.len, diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index f037a4a8d05..c641a1aac0e 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -217,7 +217,7 @@ impl DeviceManager { let boot_timer = Arc::new(Mutex::new(BootTimer::new(request_ts))); self.mmio_devices - .register_mmio_boot_timer(&vm.common.resource_allocator.mmio_bus, boot_timer)?; + .register_mmio_boot_timer(&vm.common.mmio_bus, boot_timer)?; Ok(()) } @@ -256,8 +256,7 @@ impl DeviceManager { } let rtc = Arc::new(Mutex::new(RTCDevice::new())); - self.mmio_devices - .register_mmio_rtc(&vm.common.resource_allocator, rtc, None)?; + self.mmio_devices.register_mmio_rtc(vm, rtc, None)?; Ok(()) } diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index e5dde833db2..3deefc946eb 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -34,7 +34,6 @@ use crate::resources::VmResources; use crate::snapshot::Persist; use crate::vmm_config::mmds::MmdsConfigError; use crate::vstate::memory::GuestMemoryMmap; -use crate::vstate::resources::ResourceAllocator; use crate::vstate::vm::{InterruptError, MsiVectorGroup}; use crate::{EventManager, Vm}; @@ -78,14 +77,14 @@ impl PciDevices { // Currently we don't assign any IRQs to PCI devices. 
We will be using MSI-X interrupts // only. - let pci_segment = PciSegment::new(0, &vm.common.resource_allocator, &[0u8; 32])?; + let pci_segment = PciSegment::new(0, vm, &[0u8; 32])?; self.pci_segment = Some(pci_segment); Ok(()) } fn register_bars_with_bus( - resource_allocator: &ResourceAllocator, + vm: &Vm, virtio_device: &Arc>, ) -> Result<(), PciManagerError> { for bar in &virtio_device.lock().expect("Poisoned lock").bar_regions { @@ -97,11 +96,8 @@ impl PciDevices { bar.size() ); #[cfg(target_arch = "x86_64")] - resource_allocator.pio_bus.insert( - virtio_device.clone(), - bar.addr(), - bar.size(), - )?; + vm.pio_bus + .insert(virtio_device.clone(), bar.addr(), bar.size())?; #[cfg(target_arch = "aarch64")] log::error!("pci: We do not support I/O region allocation") } @@ -111,11 +107,9 @@ impl PciDevices { bar.addr(), bar.size() ); - resource_allocator.mmio_bus.insert( - virtio_device.clone(), - bar.addr(), - bar.size(), - )?; + vm.common + .mmio_bus + .insert(virtio_device.clone(), bar.addr(), bar.size())?; } } } @@ -171,7 +165,7 @@ impl PciDevices { self.virtio_devices .insert((device_type, id.clone()), virtio_device.clone()); - Self::register_bars_with_bus(resource_allocator, &virtio_device)?; + Self::register_bars_with_bus(vm, &virtio_device)?; virtio_device .lock() .expect("Poisoned lock") @@ -216,7 +210,7 @@ impl PciDevices { self.virtio_devices .insert((device_type, device_id.to_string()), virtio_device.clone()); - Self::register_bars_with_bus(&vm.common.resource_allocator, &virtio_device)?; + Self::register_bars_with_bus(vm, &virtio_device)?; virtio_device .lock() .expect("Poisoned lock") diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 1273de48ba8..fca94595372 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -371,7 +371,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { if state.type_ == DeviceType::Rtc { let rtc = Arc::new(Mutex::new(RTCDevice::new())); dev_manager.register_mmio_rtc( - &constructor_args.vm.common.resource_allocator, + constructor_args.vm, rtc, Some(state.device_info), )?; diff --git a/src/vmm/src/devices/pci/pci_segment.rs b/src/vmm/src/devices/pci/pci_segment.rs index e957332bb0e..c37763eab3a 100644 --- a/src/vmm/src/devices/pci/pci_segment.rs +++ b/src/vmm/src/devices/pci/pci_segment.rs @@ -21,7 +21,7 @@ use uuid::Uuid; use vm_allocator::AddressAllocator; use vm_device::{BusDeviceSync, BusError}; -use crate::arch::{PCI_MMCONFIG_START, PCI_MMIO_CONFIG_SIZE_PER_SEGMENT}; +use crate::arch::{ArchVm as Vm, PCI_MMCONFIG_START, PCI_MMIO_CONFIG_SIZE_PER_SEGMENT}; use crate::vstate::resources::ResourceAllocator; pub struct PciSegment { @@ -67,28 +67,21 @@ impl std::fmt::Debug for PciSegment { } impl PciSegment { - fn build( - id: u16, - resource_allocator: &Arc, - pci_irq_slots: &[u8; 32], - ) -> Result { + fn build(id: u16, vm: &Arc, pci_irq_slots: &[u8; 32]) -> Result { let pci_root = PciRoot::new(None); - let pci_bus = Arc::new(Mutex::new(PciBus::new( - pci_root, - resource_allocator.clone(), - ))); + let pci_bus = Arc::new(Mutex::new(PciBus::new(pci_root, vm.clone()))); let pci_config_mmio = Arc::new(Mutex::new(PciConfigMmio::new(Arc::clone(&pci_bus)))); let mmio_config_address = PCI_MMCONFIG_START + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT * id as u64; - resource_allocator.mmio_bus.insert( + vm.common.mmio_bus.insert( Arc::clone(&pci_config_mmio) as Arc, mmio_config_address, PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, )?; - let mem32_allocator = 
resource_allocator.mmio32_memory.clone(); - let mem64_allocator = resource_allocator.mmio64_memory.clone(); + let mem32_allocator = vm.common.resource_allocator.mmio32_memory.clone(); + let mem64_allocator = vm.common.resource_allocator.mmio64_memory.clone(); let start_of_mem32_area = mem32_allocator.lock().unwrap().base(); let end_of_mem32_area = mem32_allocator.lock().unwrap().end(); @@ -119,13 +112,15 @@ impl PciSegment { #[cfg(target_arch = "x86_64")] pub(crate) fn new( id: u16, - resource_allocator: &Arc, + vm: &Arc, pci_irq_slots: &[u8; 32], ) -> Result { - let mut segment = Self::build(id, resource_allocator, pci_irq_slots)?; + use crate::Vm; + + let mut segment = Self::build(id, vm, pci_irq_slots)?; let pci_config_io = Arc::new(Mutex::new(PciConfigIo::new(Arc::clone(&segment.pci_bus)))); - resource_allocator.pio_bus.insert( + vm.pio_bus.insert( pci_config_io.clone(), PCI_CONFIG_IO_PORT, PCI_CONFIG_IO_PORT_SIZE, @@ -151,10 +146,10 @@ impl PciSegment { #[cfg(target_arch = "aarch64")] pub(crate) fn new( id: u16, - resource_allocator: &Arc, + vm: &Arc, pci_irq_slots: &[u8; 32], ) -> Result { - let segment = Self::build(id, resource_allocator, pci_irq_slots)?; + let segment = Self::build(id, vm, pci_irq_slots)?; info!( "pci: adding PCI segment: id={:#x}, PCI MMIO config address: {:#x}, mem32 area: \ [{:#x}-{:#x}], mem64 area: [{:#x}-{:#x}]", @@ -468,13 +463,14 @@ mod tests { use super::*; use crate::arch; + use crate::builder::tests::default_vmm; use crate::utils::u64_to_usize; #[test] fn test_pci_segment_build() { - let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); + let vmm = default_vmm(); let pci_irq_slots = &[0u8; 32]; - let pci_segment = PciSegment::new(0, &resource_allocator, pci_irq_slots).unwrap(); + let pci_segment = PciSegment::new(0, &vmm.vm, pci_irq_slots).unwrap(); assert_eq!(pci_segment.id, 0); assert_eq!( @@ -503,17 +499,14 @@ mod tests { #[cfg(target_arch = "x86_64")] #[test] fn test_io_bus() { - let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); + let vmm = default_vmm(); let pci_irq_slots = &[0u8; 32]; - let pci_segment = PciSegment::new(0, &resource_allocator, pci_irq_slots).unwrap(); + let pci_segment = PciSegment::new(0, &vmm.vm, pci_irq_slots).unwrap(); let mut data = [0u8; u64_to_usize(PCI_CONFIG_IO_PORT_SIZE)]; - resource_allocator - .pio_bus - .read(PCI_CONFIG_IO_PORT, &mut data) - .unwrap(); + vmm.vm.pio_bus.read(PCI_CONFIG_IO_PORT, &mut data).unwrap(); - resource_allocator + vmm.vm .pio_bus .read(PCI_CONFIG_IO_PORT + PCI_CONFIG_IO_PORT_SIZE, &mut data) .unwrap_err(); @@ -521,17 +514,19 @@ mod tests { #[test] fn test_mmio_bus() { - let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); + let vmm = default_vmm(); let pci_irq_slots = &[0u8; 32]; - let pci_segment = PciSegment::new(0, &resource_allocator, pci_irq_slots).unwrap(); + let pci_segment = PciSegment::new(0, &vmm.vm, pci_irq_slots).unwrap(); let mut data = [0u8; u64_to_usize(PCI_MMIO_CONFIG_SIZE_PER_SEGMENT)]; - resource_allocator + vmm.vm + .common .mmio_bus .read(pci_segment.mmio_config_address, &mut data) .unwrap(); - resource_allocator + vmm.vm + .common .mmio_bus .read( pci_segment.mmio_config_address + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, @@ -542,9 +537,9 @@ mod tests { #[test] fn test_next_device_bdf() { - let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); + let vmm = default_vmm(); let pci_irq_slots = &[0u8; 32]; - let pci_segment = PciSegment::new(0, &resource_allocator, pci_irq_slots).unwrap(); + let pci_segment = 
PciSegment::new(0, &vmm.vm, pci_irq_slots).unwrap(); // Start checking from device id 1, since 0 is allocated to the Root port. for dev_id in 1..32 { diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 2b558d566d4..e53439373c7 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -371,10 +371,9 @@ impl Vmm { self.vcpus_handles.reserve(vcpu_count); for mut vcpu in vcpus.drain(..) { - vcpu.set_mmio_bus(self.vm.common.resource_allocator.mmio_bus.clone()); + vcpu.set_mmio_bus(self.vm.common.mmio_bus.clone()); #[cfg(target_arch = "x86_64")] - vcpu.kvm_vcpu - .set_pio_bus(self.vm.common.resource_allocator.pio_bus.clone()); + vcpu.kvm_vcpu.set_pio_bus(self.vm.pio_bus.clone()); self.vcpus_handles .push(vcpu.start_threaded(vcpu_seccomp_filter.clone(), barrier.clone())?); diff --git a/src/vmm/src/vstate/resources.rs b/src/vmm/src/vstate/resources.rs index 91522de4cb1..b0cb0ab625d 100644 --- a/src/vmm/src/vstate/resources.rs +++ b/src/vmm/src/vstate/resources.rs @@ -4,11 +4,9 @@ use std::convert::Infallible; use std::sync::{Arc, Mutex}; -use pci::DeviceRelocation; use serde::{Deserialize, Serialize}; pub use vm_allocator::AllocPolicy; use vm_allocator::{AddressAllocator, IdAllocator}; -use vm_device::Bus; use crate::arch; use crate::snapshot::Persist; @@ -30,11 +28,6 @@ pub struct ResourceAllocator { pub mmio64_memory: Arc>, /// Memory allocator for system data pub system_memory: Arc>, - /// MMIO bus - pub mmio_bus: Arc, - #[cfg(target_arch = "x86_64")] - /// Port IO bus - pub pio_bus: Arc, } impl ResourceAllocator { @@ -54,9 +47,6 @@ impl ResourceAllocator { arch::SYSTEM_MEM_START, arch::SYSTEM_MEM_SIZE, )?)), - mmio_bus: Arc::new(Bus::new()), - #[cfg(target_arch = "x86_64")] - pio_bus: Arc::new(Bus::new()), }) } @@ -178,9 +168,6 @@ impl<'a> Persist<'a> for ResourceAllocator { mmio32_memory: state.mmio32_memory.clone(), mmio64_memory: state.mmio64_memory.clone(), system_memory: state.system_memory.clone(), - mmio_bus: Arc::new(Bus::new()), - #[cfg(target_arch = "x86_64")] - pio_bus: Arc::new(Bus::new()), }) } } @@ -219,19 +206,6 @@ impl Default for ResourceAllocatorState { } } -impl DeviceRelocation for ResourceAllocator { - fn move_bar( - &self, - _old_base: u64, - _new_base: u64, - _len: u64, - _pci_dev: &mut dyn pci::PciDevice, - _region_type: pci::PciBarRegionType, - ) -> Result<(), std::io::Error> { - todo!() - } -} - #[cfg(test)] mod tests { use vm_allocator::AllocPolicy; diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index c19c7c26bd4..ff29b1a2d1d 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -19,7 +19,8 @@ use kvm_bindings::{ KvmIrqRouting, kvm_irq_routing_entry, kvm_userspace_memory_region, }; use kvm_ioctls::VmFd; -use log::debug; +use log::{debug, error}; +use pci::DeviceRelocation; use serde::{Deserialize, Serialize}; use vm_device::interrupt::{ InterruptIndex, InterruptSourceConfig, InterruptSourceGroup, MsiIrqSourceConfig, @@ -247,6 +248,8 @@ pub struct VmCommon { pub interrupts: Mutex>, /// Allocator for VM resources pub resource_allocator: Arc, + /// MMIO bus + pub mmio_bus: Arc, } /// Errors associated with the wrappers over KVM ioctls. 
@@ -322,6 +325,7 @@ impl Vm { guest_memory: GuestMemoryMmap::default(), interrupts: Mutex::new(HashMap::new()), resource_allocator: Arc::new(ResourceAllocator::new()?), + mmio_bus: Arc::new(vm_device::Bus::new()), }) } @@ -653,6 +657,20 @@ fn mincore_bitmap(region: &GuestRegionMmap) -> Result, VmError> { Ok(bitmap) } +impl DeviceRelocation for Vm { + fn move_bar( + &self, + _old_base: u64, + _new_base: u64, + _len: u64, + _pci_dev: &mut dyn pci::PciDevice, + _region_type: pci::PciBarRegionType, + ) -> Result<(), std::io::Error> { + error!("pci: device relocation not supported"); + Err(std::io::Error::from(std::io::ErrorKind::Unsupported)) + } +} + #[cfg(test)] pub(crate) mod tests { use vm_device::interrupt::{InterruptSourceConfig, LegacyIrqSourceConfig}; From 8c431706119d916ba2fc43b36c0c81eaac7c2511 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 18 Jun 2025 23:39:14 +0200 Subject: [PATCH 49/99] arm: support MSI-X on ARM Add support for the ITS device, which provides MSI interrupts on the ARM architecture. This is currently supported only on systems with a GICv3 interrupt controller. In order to make saving/restoring of ITS state work properly, we need to change the order in which we restore the redistributor register GICR_CTLR: it must be restored last. Otherwise, restoring GICR_PROPBASER has no effect, and the ITS depends on it in order to save/restore the ITS tables to/from guest memory. Signed-off-by: Babis Chalios --- src/vmm/src/arch/aarch64/fdt.rs | 30 ++++ src/vmm/src/arch/aarch64/gic/gicv2/mod.rs | 4 +- .../src/arch/aarch64/gic/gicv2/regs/mod.rs | 1 + src/vmm/src/arch/aarch64/gic/gicv3/mod.rs | 88 +++++++++--- .../arch/aarch64/gic/gicv3/regs/its_regs.rs | 135 ++++++++++++++++++ .../src/arch/aarch64/gic/gicv3/regs/mod.rs | 48 +++++-- .../aarch64/gic/gicv3/regs/redist_regs.rs | 2 +- src/vmm/src/arch/aarch64/gic/mod.rs | 22 +++ src/vmm/src/arch/aarch64/gic/regs.rs | 3 + src/vmm/src/arch/aarch64/output_GICv3.dtb | Bin 2097152 -> 2097152 bytes .../src/arch/aarch64/output_initrd_GICv3.dtb | Bin 2097152 -> 2097152 bytes 11 files changed, 298 insertions(+), 35 deletions(-) create mode 100644 src/vmm/src/arch/aarch64/gic/gicv3/regs/its_regs.rs diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index 8ae5e764df4..429153669fa 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -28,6 +28,8 @@ use crate::vstate::memory::{Address, GuestMemory, GuestMemoryMmap}; const GIC_PHANDLE: u32 = 1; // This is a value for uniquely identifying the FDT node containing the clock definition. const CLOCK_PHANDLE: u32 = 2; +// This is a value for uniquely identifying the FDT node declaring the MSI controller. +const MSI_PHANDLE: u32 = 3; // You may be wondering why this big value? // This phandle is used to uniquely identify the FDT nodes containing cache information. Each cpu // can have a variable number of caches, some of these caches may be shared with other cpus.
@@ -302,6 +304,16 @@ fn create_gic_node(fdt: &mut FdtWriter, gic_device: &GICDevice) -> Result<(), Fd ]; fdt.property_array_u32("interrupts", &gic_intr)?; + + if let Some(msi_properties) = gic_device.msi_properties() { + let msic_node = fdt.begin_node("msic")?; + fdt.property_string("compatible", "arm,gic-v3-its")?; + fdt.property_null("msi-controller")?; + fdt.property_u32("phandle", MSI_PHANDLE)?; + fdt.property_array_u64("reg", msi_properties)?; + fdt.end_node(msic_node)?; + } + fdt.end_node(interrupt)?; Ok(()) @@ -471,6 +483,21 @@ fn create_pci_nodes(fdt: &mut FdtWriter, pci_devices: &PciDevices) -> Result<(), (MEM_64BIT_DEVICES_SIZE >> 32) as u32, // Range size ((MEM_64BIT_DEVICES_SIZE & 0xffff_ffff) >> 32) as u32, ]; + + // See kernel document Documentation/devicetree/bindings/pci/pci-msi.txt + let msi_map = [ + // rid-base: A single cell describing the first RID matched by the entry. + 0x0, + // msi-controller: A single phandle to an MSI controller. + MSI_PHANDLE, + // msi-base: An msi-specifier describing the msi-specifier produced for the + // first RID matched by the entry. + segment.id as u32, + // length: A single cell describing how many consecutive RIDs are matched + // following the rid-base. + 0x100, + ]; + let pci_node = fdt.begin_node(&pci_node_name)?; fdt.property_string("compatible", "pci-host-ecam-generic")?; @@ -491,6 +518,9 @@ fn create_pci_nodes(fdt: &mut FdtWriter, pci_devices: &PciDevices) -> Result<(), fdt.property_null("interrupt-map")?; fdt.property_null("interrupt-map-mask")?; fdt.property_null("dma-coherent")?; + fdt.property_array_u32("msi-map", &msi_map)?; + fdt.property_u32("msi-parent", MSI_PHANDLE)?; + Ok(fdt.end_node(pci_node)?) } diff --git a/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs index c4b9208a0a6..dfa2302d6be 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs @@ -68,7 +68,9 @@ impl GICv2 { GICv2::get_cpu_addr(), GICv2::get_cpu_size(), ], + msi_properties: None, vcpu_count, + its_device: None, }) } @@ -82,7 +84,7 @@ impl GICv2 { pub fn init_device_attributes(gic_device: &Self) -> Result<(), GicError> { // Setting up the distributor attribute. - // We are placing the GIC below 1GB so we need to substract the size of the distributor. + // We are placing the GIC below 1GB so we need to subtract the size of the distributor. Self::set_device_attribute( gic_device.device_fd(), kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ADDR, diff --git a/src/vmm/src/arch/aarch64/gic/gicv2/regs/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv2/regs/mod.rs index 8bb26ce2bcd..2b617716fe2 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv2/regs/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv2/regs/mod.rs @@ -22,6 +22,7 @@ pub fn save_state(fd: &DeviceFd, mpidrs: &[u64]) -> Result { Ok(GicState { dist: dist_regs::get_dist_regs(fd)?, gic_vcpu_states: vcpu_states, + ..Default::default() }) } diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs index 39c4e5ce148..075687bc23e 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs @@ -1,7 +1,7 @@ // Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 -mod regs; +pub mod regs; use kvm_ioctls::{DeviceFd, VmFd}; @@ -18,12 +18,19 @@ impl std::ops::Deref for GICv3 { } } +impl std::ops::DerefMut for GICv3 { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + impl GICv3 { // Unfortunately bindgen omits defines that are based on other defines. // See arch/arm64/include/uapi/asm/kvm.h file from the linux kernel. const SZ_64K: u64 = 0x0001_0000; const KVM_VGIC_V3_DIST_SIZE: u64 = GICv3::SZ_64K; const KVM_VGIC_V3_REDIST_SIZE: u64 = (2 * GICv3::SZ_64K); + const GIC_V3_ITS_SIZE: u64 = 0x2_0000; // Device trees specific constants const ARCH_GIC_V3_MAINT_IRQ: u32 = 9; @@ -48,6 +55,16 @@ impl GICv3 { vcpu_count * GICv3::KVM_VGIC_V3_REDIST_SIZE } + /// Get the MSI address + fn get_msi_address(vcpu_count: u64) -> u64 { + Self::get_redists_addr(vcpu_count) - GICv3::GIC_V3_ITS_SIZE + } + + /// Get the MSI size + const fn get_msi_size() -> u64 { + GICv3::GIC_V3_ITS_SIZE + } + pub const VERSION: u32 = kvm_bindings::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_V3; pub fn fdt_compatibility(&self) -> &str { @@ -59,30 +76,43 @@ impl GICv3 { } /// Create the GIC device object - pub fn create_device(fd: DeviceFd, vcpu_count: u64) -> Self { - GICv3(super::GIC { - fd, + pub fn create_device(vm: &VmFd, vcpu_count: u64) -> Result { + // Create the GIC device + let mut gic_device = kvm_bindings::kvm_create_device { + type_: Self::VERSION, + fd: 0, + flags: 0, + }; + + let gic_fd = vm + .create_device(&mut gic_device) + .map_err(GicError::CreateGIC)?; + + Ok(GICv3(super::GIC { + fd: gic_fd, properties: [ GICv3::get_dist_addr(), GICv3::get_dist_size(), GICv3::get_redists_addr(vcpu_count), GICv3::get_redists_size(vcpu_count), ], + msi_properties: Some([GICv3::get_msi_address(vcpu_count), GICv3::get_msi_size()]), vcpu_count, - }) + its_device: None, + })) } pub fn save_device(&self, mpidrs: &[u64]) -> Result { - regs::save_state(&self.fd, mpidrs) + regs::save_state(&self.fd, self.its_device.as_ref().unwrap(), mpidrs) } pub fn restore_device(&self, mpidrs: &[u64], state: &GicState) -> Result<(), GicError> { - regs::restore_state(&self.fd, mpidrs, state) + regs::restore_state(&self.fd, self.its_device.as_ref().unwrap(), mpidrs, state) } pub fn init_device_attributes(gic_device: &Self) -> Result<(), GicError> { // Setting up the distributor attribute. - // We are placing the GIC below 1GB so we need to substract the size of the distributor. + // We are placing the GIC below 1GB so we need to subtract the size of the distributor. 
Self::set_device_attribute( gic_device.device_fd(), kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ADDR, @@ -104,25 +134,45 @@ impl GICv3 { Ok(()) } - /// Initialize a GIC device - pub fn init_device(vm: &VmFd) -> Result { - let mut gic_device = kvm_bindings::kvm_create_device { - type_: Self::VERSION, + fn init_its(vm: &VmFd, gic_device: &mut Self) -> Result<(), GicError> { + // ITS part attributes + let mut its_device = kvm_bindings::kvm_create_device { + type_: kvm_bindings::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_ITS, fd: 0, flags: 0, }; - vm.create_device(&mut gic_device) - .map_err(GicError::CreateGIC) + let its_fd = vm + .create_device(&mut its_device) + .map_err(GicError::CreateGIC)?; + + // Setting up the ITS attributes + Self::set_device_attribute( + &its_fd, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ADDR, + u64::from(kvm_bindings::KVM_VGIC_ITS_ADDR_TYPE), + &Self::get_msi_address(gic_device.vcpu_count()) as *const u64 as u64, + 0, + )?; + + Self::set_device_attribute( + &its_fd, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_CTRL, + u64::from(kvm_bindings::KVM_DEV_ARM_VGIC_CTRL_INIT), + 0, + 0, + )?; + + gic_device.its_device = Some(its_fd); + Ok(()) } /// Method to initialize the GIC device pub fn create(vm: &VmFd, vcpu_count: u64) -> Result { - let vgic_fd = Self::init_device(vm)?; - - let device = Self::create_device(vgic_fd, vcpu_count); + let mut device = Self::create_device(vm, vcpu_count)?; Self::init_device_attributes(&device)?; + Self::init_its(vm, &mut device)?; Self::finalize_device(&device)?; @@ -184,14 +234,14 @@ impl GICv3 { /// RDIST pending tables into guest RAM. /// /// The tables get flushed to guest RAM whenever the VM gets stopped. -fn save_pending_tables(fd: &DeviceFd) -> Result<(), GicError> { +fn save_pending_tables(gic_device: &DeviceFd) -> Result<(), GicError> { let init_gic_attr = kvm_bindings::kvm_device_attr { group: kvm_bindings::KVM_DEV_ARM_VGIC_GRP_CTRL, attr: u64::from(kvm_bindings::KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES), addr: 0, flags: 0, }; - fd.set_device_attr(&init_gic_attr).map_err(|err| { + gic_device.set_device_attr(&init_gic_attr).map_err(|err| { GicError::DeviceAttribute(err, true, kvm_bindings::KVM_DEV_ARM_VGIC_GRP_CTRL) }) } diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/regs/its_regs.rs b/src/vmm/src/arch/aarch64/gic/gicv3/regs/its_regs.rs new file mode 100644 index 00000000000..ee4ecafba1e --- /dev/null +++ b/src/vmm/src/arch/aarch64/gic/gicv3/regs/its_regs.rs @@ -0,0 +1,135 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +use kvm_bindings::{ + KVM_DEV_ARM_ITS_RESTORE_TABLES, KVM_DEV_ARM_ITS_SAVE_TABLES, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_GRP_ITS_REGS, +}; +use kvm_ioctls::DeviceFd; +use serde::{Deserialize, Serialize}; + +use crate::arch::aarch64::gic::GicError; + +// ITS registers that we want to preserve across snapshots +const GITS_CTLR: u32 = 0x0000; +const GITS_IIDR: u32 = 0x0004; +const GITS_CBASER: u32 = 0x0080; +const GITS_CWRITER: u32 = 0x0088; +const GITS_CREADR: u32 = 0x0090; +const GITS_BASER: u32 = 0x0100; + +fn set_device_attribute( + its_device: &DeviceFd, + group: u32, + attr: u32, + val: u64, +) -> Result<(), GicError> { + let gicv3_its_attr = kvm_bindings::kvm_device_attr { + group, + attr: attr as u64, + addr: &val as *const u64 as u64, + flags: 0, + }; + + its_device + .set_device_attr(&gicv3_its_attr) + .map_err(|err| GicError::DeviceAttribute(err, true, group)) +} + +fn get_device_attribute(its_device: &DeviceFd, group: u32, attr: u32) -> Result { + let mut val = 0; + + let mut gicv3_its_attr = kvm_bindings::kvm_device_attr { + group, + attr: attr as u64, + addr: &mut val as *mut u64 as u64, + flags: 0, + }; + + // SAFETY: gicv3_its_attr.addr is safe to write to. + unsafe { its_device.get_device_attr(&mut gicv3_its_attr) } + .map_err(|err| GicError::DeviceAttribute(err, false, group))?; + + Ok(val) +} + +fn its_read_register(its_fd: &DeviceFd, attr: u32) -> Result { + get_device_attribute(its_fd, KVM_DEV_ARM_VGIC_GRP_ITS_REGS, attr) +} + +fn its_set_register(its_fd: &DeviceFd, attr: u32, val: u64) -> Result<(), GicError> { + set_device_attribute(its_fd, KVM_DEV_ARM_VGIC_GRP_ITS_REGS, attr, val) +} + +pub fn its_save_tables(its_fd: &DeviceFd) -> Result<(), GicError> { + set_device_attribute( + its_fd, + KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_ITS_SAVE_TABLES, + 0, + ) +} + +pub fn its_restore_tables(its_fd: &DeviceFd) -> Result<(), GicError> { + set_device_attribute( + its_fd, + KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_ITS_RESTORE_TABLES, + 0, + ) +} + +/// ITS registers that we save/restore during snapshot +#[derive(Debug, Default, Serialize, Deserialize)] +pub struct ItsRegisterState { + iidr: u64, + cbaser: u64, + creadr: u64, + cwriter: u64, + baser: [u64; 8], + ctlr: u64, +} + +impl ItsRegisterState { + /// Save ITS state + pub fn save(its_fd: &DeviceFd) -> Result { + let mut state = ItsRegisterState::default(); + + for i in 0..8 { + state.baser[i as usize] = its_read_register(its_fd, GITS_BASER + i * 8)?; + } + state.ctlr = its_read_register(its_fd, GITS_CTLR)?; + state.cbaser = its_read_register(its_fd, GITS_CBASER)?; + state.creadr = its_read_register(its_fd, GITS_CREADR)?; + state.cwriter = its_read_register(its_fd, GITS_CWRITER)?; + state.iidr = its_read_register(its_fd, GITS_IIDR)?; + + Ok(state) + } + + /// Restore ITS state + /// + /// We need to restore ITS registers in a very specific order for things to work. Take a look + /// at: + /// https://elixir.bootlin.com/linux/v6.1.141/source/Documentation/virt/kvm/devices/arm-vgic-its.rst#L60 + /// and + /// https://elixir.bootlin.com/linux/v6.1.141/source/Documentation/virt/kvm/devices/arm-vgic-its.rst#L123 + /// + /// for more details, but TL;DR is: + /// + /// We need to restore GITS_CBASER, GITS_CREADER, GITS_CWRITER, GITS_BASER and GITS_IIDR + /// registers before restoring ITS tables from guest memory. We also need to set GITS_CTLR + /// last. 
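+ ///
+ /// Illustrative usage (a sketch based on the call sites in this patch, not additional API):
+ /// with an `its_fd` obtained from `GICDevice::its_fd()`, the snapshot path first flushes the
+ /// tables with `its_save_tables(its_fd)?` and then captures the registers with
+ /// `ItsRegisterState::save(its_fd)?`; on restore, `state.restore(its_fd)?` is called only
+ /// after the distributor and redistributor registers have been written back, mirroring
+ /// `restore_state` in `gicv3/regs/mod.rs`.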
+ pub fn restore(&self, its_fd: &DeviceFd) -> Result<(), GicError> { + its_set_register(its_fd, GITS_IIDR, self.iidr)?; + its_set_register(its_fd, GITS_CBASER, self.cbaser)?; + its_set_register(its_fd, GITS_CREADR, self.creadr)?; + its_set_register(its_fd, GITS_CWRITER, self.cwriter)?; + for i in 0..8 { + its_set_register(its_fd, GITS_BASER + i * 8, self.baser[i as usize])?; + } + // We need to restore saved ITS tables before restoring GITS_CTLR + its_restore_tables(its_fd)?; + its_set_register(its_fd, GITS_CTLR, self.ctlr) + } +} diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs index 0531766dc54..3df0d4642d7 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs @@ -3,45 +3,63 @@ mod dist_regs; mod icc_regs; +pub mod its_regs; mod redist_regs; +use its_regs::{ItsRegisterState, its_save_tables}; use kvm_ioctls::DeviceFd; use crate::arch::aarch64::gic::GicError; use crate::arch::aarch64::gic::regs::{GicState, GicVcpuState}; /// Save the state of the GIC device. -pub fn save_state(fd: &DeviceFd, mpidrs: &[u64]) -> Result { +pub fn save_state( + gic_device: &DeviceFd, + its_device: &DeviceFd, + mpidrs: &[u64], +) -> Result { // Flush redistributors pending tables to guest RAM. - super::save_pending_tables(fd)?; + super::save_pending_tables(gic_device)?; + // Flush ITS tables into guest memory. + its_save_tables(its_device)?; let mut vcpu_states = Vec::with_capacity(mpidrs.len()); for mpidr in mpidrs { vcpu_states.push(GicVcpuState { - rdist: redist_regs::get_redist_regs(fd, *mpidr)?, - icc: icc_regs::get_icc_regs(fd, *mpidr)?, + rdist: redist_regs::get_redist_regs(gic_device, *mpidr)?, + icc: icc_regs::get_icc_regs(gic_device, *mpidr)?, }) } + let its_state = ItsRegisterState::save(its_device)?; + Ok(GicState { - dist: dist_regs::get_dist_regs(fd)?, + dist: dist_regs::get_dist_regs(gic_device)?, gic_vcpu_states: vcpu_states, + its_state: Some(its_state), }) } /// Restore the state of the GIC device. -pub fn restore_state(fd: &DeviceFd, mpidrs: &[u64], state: &GicState) -> Result<(), GicError> { - dist_regs::set_dist_regs(fd, &state.dist)?; +pub fn restore_state( + gic_device: &DeviceFd, + its_device: &DeviceFd, + mpidrs: &[u64], + state: &GicState, +) -> Result<(), GicError> { + dist_regs::set_dist_regs(gic_device, &state.dist)?; if mpidrs.len() != state.gic_vcpu_states.len() { return Err(GicError::InconsistentVcpuCount); } for (mpidr, vcpu_state) in mpidrs.iter().zip(&state.gic_vcpu_states) { - redist_regs::set_redist_regs(fd, *mpidr, &vcpu_state.rdist)?; - icc_regs::set_icc_regs(fd, *mpidr, &vcpu_state.icc)?; + redist_regs::set_redist_regs(gic_device, *mpidr, &vcpu_state.rdist)?; + icc_regs::set_icc_regs(gic_device, *mpidr, &vcpu_state.icc)?; } - Ok(()) + // Safe to unwrap here, as we know we support an ITS device, so `its_state.is_some()` is always + // `true`. + state.its_state.as_ref().unwrap().restore(its_device) } #[cfg(test)] @@ -59,9 +77,10 @@ mod tests { let vm = kvm.create_vm().unwrap(); let gic = create_gic(&vm, 1, Some(GICVersion::GICV3)).expect("Cannot create gic"); let gic_fd = gic.device_fd(); + let its_fd = gic.its_fd().unwrap(); let mpidr = vec![1]; - let res = save_state(gic_fd, &mpidr); + let res = save_state(gic_fd, its_fd, &mpidr); // We will receive an error if trying to call before creating vcpu. 
assert_eq!( format!("{:?}", res.unwrap_err()), @@ -73,8 +92,9 @@ mod tests { let _vcpu = vm.create_vcpu(0).unwrap(); let gic = create_gic(&vm, 1, Some(GICVersion::GICV3)).expect("Cannot create gic"); let gic_fd = gic.device_fd(); + let its_fd = gic.its_fd().unwrap(); - let vm_state = save_state(gic_fd, &mpidr).unwrap(); + let vm_state = save_state(gic_fd, its_fd, &mpidr).unwrap(); let val: u32 = 0; let gicd_statusr_off = 0x0010u64; let mut gic_dist_attr = kvm_bindings::kvm_device_attr { @@ -94,7 +114,7 @@ mod tests { assert_eq!(gicd_statusr.chunks[0], val); assert_eq!(vm_state.dist.len(), 12); - restore_state(gic_fd, &mpidr, &vm_state).unwrap(); - restore_state(gic_fd, &[1, 2], &vm_state).unwrap_err(); + restore_state(gic_fd, its_fd, &mpidr, &vm_state).unwrap(); + restore_state(gic_fd, its_fd, &[1, 2], &vm_state).unwrap_err(); } } diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/regs/redist_regs.rs b/src/vmm/src/arch/aarch64/gic/gicv3/regs/redist_regs.rs index 4d1ba3292c1..96aaebc87bd 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv3/regs/redist_regs.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv3/regs/redist_regs.rs @@ -28,11 +28,11 @@ const GICR_ICFGR0: SimpleReg = SimpleReg::new(GICR_SGI_OFFSET + 0x0C00, 8); // List with relevant redistributor registers that we will be restoring. static VGIC_RDIST_REGS: &[SimpleReg] = &[ - GICR_CTLR, GICR_STATUSR, GICR_WAKER, GICR_PROPBASER, GICR_PENDBASER, + GICR_CTLR, ]; // List with relevant SGI associated redistributor registers that we will be restoring. diff --git a/src/vmm/src/arch/aarch64/gic/mod.rs b/src/vmm/src/arch/aarch64/gic/mod.rs index cda423f478c..9bfabee1fea 100644 --- a/src/vmm/src/arch/aarch64/gic/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/mod.rs @@ -21,8 +21,14 @@ pub struct GIC { /// GIC device properties, to be used for setting up the fdt entry properties: [u64; 4], + /// MSI properties of the GIC device + msi_properties: Option<[u64; 2]>, + /// Number of CPUs handled by the device vcpu_count: u64, + + /// ITS device + its_device: Option, } impl GIC { /// Returns the file descriptor of the GIC device @@ -80,6 +86,14 @@ impl GICDevice { } } + /// Returns the file descriptor of the ITS device, if any + pub fn its_fd(&self) -> Option<&DeviceFd> { + match self { + Self::V2(_) => None, + Self::V3(x) => x.its_device.as_ref(), + } + } + /// Returns an array with GIC device properties pub fn device_properties(&self) -> &[u64] { match self { @@ -88,6 +102,14 @@ impl GICDevice { } } + /// Returns an array with MSI properties if GIC supports it + pub fn msi_properties(&self) -> Option<&[u64; 2]> { + match self { + Self::V2(x) => x.msi_properties.as_ref(), + Self::V3(x) => x.msi_properties.as_ref(), + } + } + /// Returns the number of vCPUs this GIC handles pub fn vcpu_count(&self) -> u64 { match self { diff --git a/src/vmm/src/arch/aarch64/gic/regs.rs b/src/vmm/src/arch/aarch64/gic/regs.rs index 60987cc973d..1afa7acde9c 100644 --- a/src/vmm/src/arch/aarch64/gic/regs.rs +++ b/src/vmm/src/arch/aarch64/gic/regs.rs @@ -10,6 +10,7 @@ use kvm_ioctls::DeviceFd; use serde::{Deserialize, Serialize}; use crate::arch::aarch64::gic::GicError; +use crate::arch::aarch64::gic::gicv3::regs::its_regs::ItsRegisterState; #[derive(Debug, Serialize, Deserialize)] pub struct GicRegState { @@ -30,6 +31,8 @@ pub struct GicState { pub dist: Vec>, /// The state of the vcpu interfaces. pub gic_vcpu_states: Vec, + /// The state of the ITS device. Only present with GICv3. 
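+ /// For snapshots taken with a GICv2 interrupt controller this stays `None`, since GICv2's
+ /// `save_state` fills the remaining fields with `..Default::default()`.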
+ pub its_state: Option, } /// Structure used for serializing the state of the GIC registers for a specific vCPU. diff --git a/src/vmm/src/arch/aarch64/output_GICv3.dtb b/src/vmm/src/arch/aarch64/output_GICv3.dtb index 03fba87f4fedcb57536d5219315cbe6474adc7b9..35f4e9b63a35caa91b793f37e857fe3ae1c3f3aa 100644 GIT binary patch delta 357 zcmYMuy-EW?5Ww->yIVDCl1t)O{5VOMB3B72SO`|O;u9Q)%0WKhE?8MCSmXir0ek@) zLGT&USy*Umt=KE5|49S~emk=>%(B*6ZP9)#f4?hb5~b9>Qo%8~jT%A_x=BBDEz`I= zxbhO+E3$MtQ9ijUb&~fw7g-u#?!?)!m`r6RF^1Fu&i`t;%j+jlhhK6M-t=_$-kyGT zZ(h-Qopk>9KXiXrTC!ilkF)-%uDd7yrNX!3bkHB>gKqaMcelaOb!0}7(eU6Sfb?cC oi#e1qj|D8Ej0&mpamDL6FH?Hyqx%cf!So?tb`a45xjxq1YuB_ArGB|tOs0pfWAT&Zd|zX z8XmwaxEGD$-xIOm>nf_6qOvTL49(Zl&%09QNh#G<%D*KyQ9~7koY8OFI}<#gPQ9-5 zM{zWtC{3=ox|{ZoZsI7M9Eb6EluzBv1>5s~KKc5ub~CsA66*ZRy@WS^?VEQZGh5f` zp%n7g{{jhQ&~h!epvY+r`=~q8vNQc%=B_Xr4#w%Q*SkvXhktROn1{p&JZSiEy#f}n nh$5D-j1`nnMg>*Wu!;b6tYIA+*u)mLv4dUgVgDsD2c6GfW|v3F delta 249 zcmXBKy9ok86ouhC&Wz9X{eFXqMn)o{kywHaSb!K9nJg@9U;)#D4Qym;2?n-dEC`DK z;)RdHIou!!RDj7n|9y(srHCvgi7UIFi@@K{RQ2 zH0m&qr?_Y6HRqYF8oQnOxV>G6zCHU^Aps48cuOLMG&0B{2M>7^P(%r3R8U0?bu`dK U3vG1JMGt)puD%_HZ+EksA4L*6$N&HU From dde20a593fb9524b871b58cfe77ee4b85b0a0b08 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 17 Jun 2025 13:13:26 +0200 Subject: [PATCH 50/99] test: VirtIO PCI device create and restoration Refactor the test code that inserts VirtIO devices in a Vmm object and then add a test which creates a Vmm with PCI devices and then serializes and deserializes the device manager and ensures that everything is as restored as expected. Signed-off-by: Babis Chalios --- src/vmm/src/builder.rs | 12 -- src/vmm/src/device_manager/mod.rs | 30 ++++ src/vmm/src/device_manager/pci_mngr.rs | 185 +++++++++++++++++++++++++ 3 files changed, 215 insertions(+), 12 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 82299e41150..f2da5a6d045 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -848,7 +848,6 @@ pub(crate) mod tests { assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_VSOCK, &vsock_dev_id) .is_some() ); @@ -874,7 +873,6 @@ pub(crate) mod tests { assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_RNG, ENTROPY_DEV_ID) .is_some() ); @@ -909,7 +907,6 @@ pub(crate) mod tests { assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) .is_some() ); @@ -961,7 +958,6 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=/dev/vda ro")); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -983,7 +979,6 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=PARTUUID=0eaa91a0-01 rw")); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1006,7 +1001,6 @@ pub(crate) mod tests { assert!(!cmdline_contains(&cmdline, "root=/dev/vda")); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1044,19 +1038,16 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=PARTUUID=0eaa91a0-01 rw")); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, "root") .is_some() ); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, "secondary") .is_some() ); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, "third") .is_some() ); @@ -1086,7 +1077,6 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=/dev/vda 
rw")); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1108,7 +1098,6 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=PARTUUID=0eaa91a0-01 ro")); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1130,7 +1119,6 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=/dev/vda rw")); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index c641a1aac0e..2135711ea54 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -375,6 +375,36 @@ impl DeviceManager { Self::do_mark_virtio_queue_memory_dirty(virtio_device, mem); } } + + /// Get a VirtIO device of type `virtio_type` with ID `device_id` + pub fn get_virtio_device( + &self, + virtio_type: u32, + device_id: &str, + ) -> Option>> { + if self.pci_devices.pci_segment.is_some() { + let pci_device = self.pci_devices.get_virtio_device(virtio_type, device_id)?; + Some( + pci_device + .lock() + .expect("Poisoned lock") + .virtio_device() + .clone(), + ) + } else { + let mmio_device = self + .mmio_devices + .get_virtio_device(virtio_type, device_id)?; + Some( + mmio_device + .inner + .lock() + .expect("Poisoned lock") + .device() + .clone(), + ) + } + } } #[derive(Debug, Default, Clone, Serialize, Deserialize)] diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index 3deefc946eb..4d3a6ebf0f7 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -589,3 +589,188 @@ impl<'a> Persist<'a> for PciDevices { Ok(pci_devices) } } + +#[cfg(test)] +mod tests { + use vmm_sys_util::tempfile::TempFile; + + use super::*; + use crate::builder::tests::*; + use crate::device_manager; + use crate::devices::virtio::block::CacheType; + use crate::mmds::data_store::MmdsVersion; + use crate::resources::VmmConfig; + use crate::snapshot::Snapshot; + use crate::vmm_config::balloon::BalloonDeviceConfig; + use crate::vmm_config::entropy::EntropyDeviceConfig; + use crate::vmm_config::net::NetworkInterfaceConfig; + use crate::vmm_config::vsock::VsockDeviceConfig; + + #[test] + fn test_device_manager_persistence() { + let mut buf = vec![0; 65536]; + // These need to survive so the restored blocks find them. + let _block_files; + let mut tmp_sock_file = TempFile::new().unwrap(); + tmp_sock_file.remove().unwrap(); + // Set up a vmm with one of each device, and get the serialized DeviceStates. + { + let mut event_manager = EventManager::new().expect("Unable to create EventManager"); + let mut vmm = default_vmm(); + vmm.device_manager.enable_pci(&vmm.vm).unwrap(); + let mut cmdline = default_kernel_cmdline(); + + // Add a balloon device. + let balloon_cfg = BalloonDeviceConfig { + amount_mib: 123, + deflate_on_oom: false, + stats_polling_interval_s: 1, + }; + insert_balloon_device(&mut vmm, &mut cmdline, &mut event_manager, balloon_cfg); + // Add a block device. + let drive_id = String::from("root"); + let block_configs = vec![CustomBlockConfig::new( + drive_id, + true, + None, + true, + CacheType::Unsafe, + )]; + _block_files = + insert_block_devices(&mut vmm, &mut cmdline, &mut event_manager, block_configs); + // Add a net device. 
+ let network_interface = NetworkInterfaceConfig { + iface_id: String::from("netif"), + host_dev_name: String::from("hostname"), + guest_mac: None, + rx_rate_limiter: None, + tx_rate_limiter: None, + }; + insert_net_device_with_mmds( + &mut vmm, + &mut cmdline, + &mut event_manager, + network_interface, + MmdsVersion::V2, + ); + // Add a vsock device. + let vsock_dev_id = "vsock"; + let vsock_config = VsockDeviceConfig { + vsock_id: Some(vsock_dev_id.to_string()), + guest_cid: 3, + uds_path: tmp_sock_file.as_path().to_str().unwrap().to_string(), + }; + insert_vsock_device(&mut vmm, &mut cmdline, &mut event_manager, vsock_config); + // Add an entropy device. + let entropy_config = EntropyDeviceConfig::default(); + insert_entropy_device(&mut vmm, &mut cmdline, &mut event_manager, entropy_config); + + Snapshot::serialize(&mut buf.as_mut_slice(), &vmm.device_manager.save()).unwrap(); + } + + tmp_sock_file.remove().unwrap(); + + let mut event_manager = EventManager::new().expect("Unable to create EventManager"); + // Keep in mind we are re-creating here an empty DeviceManager. Restoring later on + // will create a new PciDevices manager different than vmm.pci_devices. We're doing + // this to avoid restoring the whole Vmm, since what we really need from Vmm is the Vm + // object and calling default_vmm() is the easiest way to create one. + let vmm = default_vmm(); + let device_manager_state: device_manager::DevicesState = + Snapshot::deserialize(&mut buf.as_slice()).unwrap(); + let vm_resources = &mut VmResources::default(); + let restore_args = PciDevicesConstructorArgs { + vm: vmm.vm.clone(), + mem: vmm.vm.guest_memory(), + vm_resources, + instance_id: "microvm-id", + restored_from_file: true, + event_manager: &mut event_manager, + }; + let _restored_dev_manager = + PciDevices::restore(restore_args, &device_manager_state.pci_state).unwrap(); + + let expected_vm_resources = format!( + r#"{{ + "balloon": {{ + "amount_mib": 123, + "deflate_on_oom": false, + "stats_polling_interval_s": 1 + }}, + "drives": [ + {{ + "drive_id": "root", + "partuuid": null, + "is_root_device": true, + "cache_type": "Unsafe", + "is_read_only": true, + "path_on_host": "{}", + "rate_limiter": null, + "io_engine": "Sync", + "socket": null + }} + ], + "boot-source": {{ + "kernel_image_path": "", + "initrd_path": null, + "boot_args": null + }}, + "cpu-config": null, + "logger": null, + "machine-config": {{ + "vcpu_count": 1, + "mem_size_mib": 128, + "smt": false, + "track_dirty_pages": false, + "huge_pages": "None" + }}, + "metrics": null, + "mmds-config": {{ + "version": "V2", + "network_interfaces": [ + "netif" + ], + "ipv4_address": "169.254.169.254", + "imds_compat": false + }}, + "network-interfaces": [ + {{ + "iface_id": "netif", + "host_dev_name": "hostname", + "guest_mac": null, + "rx_rate_limiter": null, + "tx_rate_limiter": null + }} + ], + "vsock": {{ + "guest_cid": 3, + "uds_path": "{}" + }}, + "entropy": {{ + "rate_limiter": null + }} +}}"#, + _block_files.last().unwrap().as_path().to_str().unwrap(), + tmp_sock_file.as_path().to_str().unwrap() + ); + + assert_eq!( + vm_resources + .mmds + .as_ref() + .unwrap() + .lock() + .unwrap() + .version(), + MmdsVersion::V2 + ); + assert_eq!( + device_manager_state.pci_state.mmds.unwrap().version, + MmdsVersion::V2 + ); + assert_eq!( + expected_vm_resources, + serde_json::to_string_pretty(&VmmConfig::from(&*vm_resources)).unwrap() + ); + } +} From fa1ea0e0c2c479c329420955e06db7193b9f4fd6 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 20 Jun 2025 14:14:58 +0200 
Subject: [PATCH 51/99] test: enable PCI microVMs for performance testing Use pci_enabled fixture for boot time, block, and network tests to create PCI microVM variants as well. Signed-off-by: Babis Chalios --- tests/framework/microvm.py | 1 + .../performance/test_block.py | 3 ++- .../performance/test_boottime.py | 25 ++++++++++++++----- .../performance/test_network.py | 4 +-- .../performance/test_vsock.py | 3 ++- 5 files changed, 26 insertions(+), 10 deletions(-) diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index 956889210c2..3bb15a5f449 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -504,6 +504,7 @@ def dimensions(self): "rootfs": self.rootfs_file.name, "vcpus": str(self.vcpus_count), "guest_memory": f"{self.mem_size_bytes / (1024 * 1024)}MB", + "pci": f"{self.pci_enabled}", } @property diff --git a/tests/integration_tests/performance/test_block.py b/tests/integration_tests/performance/test_block.py index dfd0728084a..7fe9216e559 100644 --- a/tests/integration_tests/performance/test_block.py +++ b/tests/integration_tests/performance/test_block.py @@ -168,6 +168,7 @@ def test_block_performance( fio_mode, fio_block_size, fio_engine, + pci_enabled, io_engine, metrics, results_dir, @@ -176,7 +177,7 @@ def test_block_performance( Execute block device emulation benchmarking scenarios. """ vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False) - vm.spawn(log_level="Info", emit_metrics=True) + vm.spawn(log_level="Info", emit_metrics=True, pci=pci_enabled) vm.basic_config(vcpu_count=vcpus, mem_size_mib=GUEST_MEM_MIB) vm.add_net_iface() # Add a secondary block device for benchmark tests. diff --git a/tests/integration_tests/performance/test_boottime.py b/tests/integration_tests/performance/test_boottime.py index 69cacfd094a..c5d67470f1a 100644 --- a/tests/integration_tests/performance/test_boottime.py +++ b/tests/integration_tests/performance/test_boottime.py @@ -95,12 +95,12 @@ def to_ms(v, unit): def launch_vm_with_boot_timer( - microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib + microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib, pci_enabled ): """Launches a microVM with guest-timer and returns the reported metrics for it""" vm = microvm_factory.build(guest_kernel_acpi, rootfs_rw) vm.jailer.extra_args.update({"boot-timer": None}) - vm.spawn() + vm.spawn(pci=pci_enabled) vm.basic_config( vcpu_count=vcpu_count, mem_size_mib=mem_size_mib, @@ -116,9 +116,11 @@ def launch_vm_with_boot_timer( return (vm, boot_time_us, cpu_boot_time_us) -def test_boot_timer(microvm_factory, guest_kernel_acpi, rootfs): +def test_boot_timer(microvm_factory, guest_kernel_acpi, rootfs, pci_enabled): """Tests that the boot timer device works""" - launch_vm_with_boot_timer(microvm_factory, guest_kernel_acpi, rootfs, 1, 128) + launch_vm_with_boot_timer( + microvm_factory, guest_kernel_acpi, rootfs, 1, 128, pci_enabled + ) @pytest.mark.parametrize( @@ -127,13 +129,24 @@ def test_boot_timer(microvm_factory, guest_kernel_acpi, rootfs): ) @pytest.mark.nonci def test_boottime( - microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib, metrics + microvm_factory, + guest_kernel_acpi, + rootfs_rw, + vcpu_count, + mem_size_mib, + pci_enabled, + metrics, ): """Test boot time with different guest configurations""" for i in range(10): vm, boot_time_us, cpu_boot_time_us = launch_vm_with_boot_timer( - microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib + microvm_factory, + 
guest_kernel_acpi, + rootfs_rw, + vcpu_count, + mem_size_mib, + pci_enabled, ) if i == 0: diff --git a/tests/integration_tests/performance/test_network.py b/tests/integration_tests/performance/test_network.py index 115ed4196b7..035fb5a2b59 100644 --- a/tests/integration_tests/performance/test_network.py +++ b/tests/integration_tests/performance/test_network.py @@ -38,7 +38,7 @@ def consume_ping_output(ping_putput): @pytest.fixture -def network_microvm(request, microvm_factory, guest_kernel_acpi, rootfs): +def network_microvm(request, microvm_factory, guest_kernel_acpi, rootfs, pci_enabled): """Creates a microvm with the networking setup used by the performance tests in this file. This fixture receives its vcpu count via indirect parameterization""" @@ -46,7 +46,7 @@ def network_microvm(request, microvm_factory, guest_kernel_acpi, rootfs): guest_vcpus = request.param vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False) - vm.spawn(log_level="Info", emit_metrics=True) + vm.spawn(log_level="Info", emit_metrics=True, pci=pci_enabled) vm.basic_config(vcpu_count=guest_vcpus, mem_size_mib=guest_mem_mib) vm.add_net_iface() vm.start() diff --git a/tests/integration_tests/performance/test_vsock.py b/tests/integration_tests/performance/test_vsock.py index bad4436e568..5a023f53eea 100644 --- a/tests/integration_tests/performance/test_vsock.py +++ b/tests/integration_tests/performance/test_vsock.py @@ -80,6 +80,7 @@ def test_vsock_throughput( rootfs, vcpus, payload_length, + pci_enabled, mode, metrics, results_dir, @@ -95,7 +96,7 @@ def test_vsock_throughput( mem_size_mib = 1024 vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False) - vm.spawn(log_level="Info", emit_metrics=True) + vm.spawn(log_level="Info", emit_metrics=True, pci=pci_enabled) vm.basic_config(vcpu_count=vcpus, mem_size_mib=mem_size_mib) vm.add_net_iface() # Create a vsock device From 4183d150e01bf781cd54fcdad252fbeead6425fb Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 25 Jun 2025 10:00:15 +0200 Subject: [PATCH 52/99] test: remove pci=off default from various parts in tests We only pass pci=off if PCI is disabled in Firecracker. Adapt tests and comments to reflect that. Signed-off-by: Babis Chalios --- tests/framework/microvm.py | 6 ++++-- tests/integration_tests/performance/test_boottime.py | 5 +++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index 3bb15a5f449..3858886585a 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -807,8 +807,10 @@ def basic_config( the response is within the interval [200, 300). If boot_args is None, the default boot_args in Firecracker is - reboot=k panic=1 pci=off nomodule 8250.nr_uarts=0 - i8042.noaux i8042.nomux i8042.nopnp i8042.dumbkbd + reboot=k panic=1 nomodule 8250.nr_uarts=0 i8042.noaux i8042.nomux + i8042.nopnp i8042.dumbkbd + + if PCI is disabled, Firecracker also passes to the guest pci=off Reference: file:../../src/vmm/src/vmm_config/boot_source.rs::DEFAULT_KERNEL_CMDLINE """ diff --git a/tests/integration_tests/performance/test_boottime.py b/tests/integration_tests/performance/test_boottime.py index c5d67470f1a..7b7bd2a506a 100644 --- a/tests/integration_tests/performance/test_boottime.py +++ b/tests/integration_tests/performance/test_boottime.py @@ -11,7 +11,7 @@ # Regex for obtaining boot time from some string. 
DEFAULT_BOOT_ARGS = ( - "reboot=k panic=1 pci=off nomodule 8250.nr_uarts=0" + "reboot=k panic=1 nomodule 8250.nr_uarts=0" " i8042.noaux i8042.nomux i8042.nopnp i8042.dumbkbd" ) @@ -98,13 +98,14 @@ def launch_vm_with_boot_timer( microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib, pci_enabled ): """Launches a microVM with guest-timer and returns the reported metrics for it""" + boot_args = DEFAULT_BOOT_ARGS if pci_enabled else DEFAULT_BOOT_ARGS + " pci=off" vm = microvm_factory.build(guest_kernel_acpi, rootfs_rw) vm.jailer.extra_args.update({"boot-timer": None}) vm.spawn(pci=pci_enabled) vm.basic_config( vcpu_count=vcpu_count, mem_size_mib=mem_size_mib, - boot_args=DEFAULT_BOOT_ARGS + " init=/usr/local/bin/init", + boot_args=boot_args + " init=/usr/local/bin/init", enable_entropy_device=True, ) vm.add_net_iface() From f60247b881baa60fed55b9151b7c6223dcff5d0a Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 30 Jun 2025 12:26:48 +0200 Subject: [PATCH 53/99] virtio: add kick() method in VirtioDevice trait So that we don't have to downcast VirtioDevice trait objects to the actual device type before calling the logic to process events for each device. Signed-off-by: Babis Chalios --- src/vmm/src/device_manager/mod.rs | 89 +++----------------- src/vmm/src/devices/virtio/balloon/device.rs | 12 ++- src/vmm/src/devices/virtio/block/device.rs | 13 +++ src/vmm/src/devices/virtio/device.rs | 3 + src/vmm/src/devices/virtio/net/device.rs | 13 ++- src/vmm/src/devices/virtio/rng/device.rs | 8 ++ src/vmm/src/devices/virtio/vsock/device.rs | 15 +++- 7 files changed, 74 insertions(+), 79 deletions(-) diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 2135711ea54..34d1ba73091 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -30,14 +30,8 @@ use crate::devices::legacy::RTCDevice; use crate::devices::legacy::serial::SerialOut; use crate::devices::legacy::{IER_RDA_BIT, IER_RDA_OFFSET, SerialDevice}; use crate::devices::pseudo::BootTimer; -use crate::devices::virtio::balloon::Balloon; -use crate::devices::virtio::block::device::Block; use crate::devices::virtio::device::VirtioDevice; -use crate::devices::virtio::net::Net; -use crate::devices::virtio::rng::Entropy; use crate::devices::virtio::transport::mmio::{IrqTrigger, MmioTransport}; -use crate::devices::virtio::vsock::{TYPE_VSOCK, Vsock, VsockUnixBackend}; -use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; use crate::resources::VmResources; use crate::snapshot::Persist; use crate::vstate::memory::GuestMemoryMmap; @@ -265,85 +259,28 @@ impl DeviceManager { self.pci_devices.attach_pci_segment(vm) } - fn do_kick_device(virtio_device: Arc>) { - let mut device = virtio_device.lock().expect("Poisoned lock"); - match device.device_type() { - TYPE_BALLOON => { - let balloon = device.as_mut_any().downcast_mut::().unwrap(); - // If device is activated, kick the balloon queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in snapshot. - // Stats queue doesn't need kicking as it is notified via a `timer_fd`. - if balloon.is_activated() { - info!("kick balloon {}.", balloon.id()); - balloon.process_virtio_queues().unwrap(); - } - } - TYPE_BLOCK => { - // We only care about kicking virtio block. - // If we need to kick vhost-user-block we can do nothing. 
- if let Some(block) = device.as_mut_any().downcast_mut::() { - // If device is activated, kick the block queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in - // snapshot. No need to kick Ratelimiters - // because they are restored 'unblocked' so - // any inflight `timer_fd` events can be safely discarded. - if block.is_activated() { - info!("kick block {}.", block.id()); - block.process_virtio_queues().unwrap(); - } - } - } - TYPE_NET => { - let net = device.as_mut_any().downcast_mut::().unwrap(); - // If device is activated, kick the net queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in snapshot. - // No need to kick Ratelimiters because they are restored 'unblocked' so - // any inflight `timer_fd` events can be safely discarded. - if net.is_activated() { - info!("kick net {}.", net.id()); - net.process_virtio_queues().unwrap(); - } - } - TYPE_VSOCK => { - // Vsock has complicated protocol that isn't resilient to any packet loss, - // so for Vsock we don't support connection persistence through snapshot. - // Any in-flight packets or events are simply lost. - // Vsock is restored 'empty'. - // The only reason we still `kick` it is to make guest process - // `TRANSPORT_RESET_EVENT` event we sent during snapshot creation. - let vsock = device - .as_mut_any() - .downcast_mut::>() - .unwrap(); - if vsock.is_activated() { - info!("kick vsock {}.", vsock.id()); - vsock.signal_used_queue(0).unwrap(); - } - } - TYPE_RNG => { - let entropy = device.as_mut_any().downcast_mut::().unwrap(); - if entropy.is_activated() { - info!("kick entropy {}.", entropy.id()); - entropy.process_virtio_queues().unwrap(); - } - } - _ => (), - } - } - /// Artificially kick VirtIO devices as if they had external events. pub fn kick_virtio_devices(&self) { info!("Artificially kick devices"); // Go through MMIO VirtIO devices let _: Result<(), MmioError> = self.mmio_devices.for_each_virtio_device(|_, _, device| { let mmio_transport_locked = device.inner.lock().expect("Poisoned lock"); - Self::do_kick_device(mmio_transport_locked.device()); + mmio_transport_locked + .device() + .lock() + .expect("Poisoned lock") + .kick(); Ok(()) }); // Go through PCI VirtIO devices - for device in self.pci_devices.virtio_devices.values() { - let virtio_device = device.lock().expect("Poisoned lock").virtio_device(); - Self::do_kick_device(virtio_device); + for virtio_pci_device in self.pci_devices.virtio_devices.values() { + virtio_pci_device + .lock() + .expect("Poisoned lock") + .virtio_device() + .lock() + .expect("Poisoned lock") + .kick(); } } diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs index 3cfcbed4465..4586592182c 100644 --- a/src/vmm/src/devices/virtio/balloon/device.rs +++ b/src/vmm/src/devices/virtio/balloon/device.rs @@ -5,7 +5,7 @@ use std::ops::Deref; use std::sync::Arc; use std::time::Duration; -use log::error; +use log::{error, info}; use serde::Serialize; use timerfd::{ClockId, SetTimeFlags, TimerFd, TimerState}; use vmm_sys_util::eventfd::EventFd; @@ -621,6 +621,16 @@ impl VirtioDevice for Balloon { fn is_activated(&self) -> bool { self.device_state.is_activated() } + + fn kick(&mut self) { + // If device is activated, kick the balloon queue(s) to make up for any + // pending or in-flight epoll events we may have not captured in snapshot. + // Stats queue doesn't need kicking as it is notified via a `timer_fd`. 
+ if self.is_activated() { + info!("kick balloon {}.", self.id()); + self.process_virtio_queues(); + } + } } #[cfg(test)] diff --git a/src/vmm/src/devices/virtio/block/device.rs b/src/vmm/src/devices/virtio/block/device.rs index d58550acc59..c1fa95f7b1c 100644 --- a/src/vmm/src/devices/virtio/block/device.rs +++ b/src/vmm/src/devices/virtio/block/device.rs @@ -4,6 +4,7 @@ use std::sync::Arc; use event_manager::{EventOps, Events, MutEventSubscriber}; +use log::info; use vmm_sys_util::eventfd::EventFd; use super::BlockError; @@ -214,6 +215,18 @@ impl VirtioDevice for Block { Self::VhostUser(b) => b.device_state.is_activated(), } } + + fn kick(&mut self) { + // If device is activated, kick the block queue(s) to make up for any + // pending or in-flight epoll events we may have not captured in + // snapshot. No need to kick Ratelimiters + // because they are restored 'unblocked' so + // any inflight `timer_fd` events can be safely discarded. + if self.is_activated() { + info!("kick block {}.", self.id()); + self.process_virtio_queues(); + } + } } impl MutEventSubscriber for Block { diff --git a/src/vmm/src/devices/virtio/device.rs b/src/vmm/src/devices/virtio/device.rs index 7b51a4b1dbf..ca3efc8cf2f 100644 --- a/src/vmm/src/devices/virtio/device.rs +++ b/src/vmm/src/devices/virtio/device.rs @@ -159,6 +159,9 @@ pub trait VirtioDevice: AsAny + Send { } Ok(()) } + + /// Kick the device, as if it had received external events. + fn kick(&mut self) {} } impl fmt::Debug for dyn VirtioDevice { diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index 4c6022a0067..0b2f3150c09 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -13,7 +13,7 @@ use std::ops::Deref; use std::sync::{Arc, Mutex}; use libc::{EAGAIN, iovec}; -use log::error; +use log::{error, info}; use vmm_sys_util::eventfd::EventFd; use super::NET_QUEUE_MAX_SIZE; @@ -1059,6 +1059,17 @@ impl VirtioDevice for Net { fn is_activated(&self) -> bool { self.device_state.is_activated() } + + fn kick(&mut self) { + // If device is activated, kick the net queue(s) to make up for any + // pending or in-flight epoll events we may have not captured in snapshot. + // No need to kick Ratelimiters because they are restored 'unblocked' so + // any inflight `timer_fd` events can be safely discarded. 
+ if self.is_activated() { + info!("kick net {}.", self.id()); + self.process_virtio_queues(); + } + } } #[cfg(test)] diff --git a/src/vmm/src/devices/virtio/rng/device.rs b/src/vmm/src/devices/virtio/rng/device.rs index a0b98cdc8b7..2cf1c6bf5dd 100644 --- a/src/vmm/src/devices/virtio/rng/device.rs +++ b/src/vmm/src/devices/virtio/rng/device.rs @@ -6,6 +6,7 @@ use std::ops::Deref; use std::sync::Arc; use aws_lc_rs::rand; +use log::info; use vm_memory::GuestMemoryError; use vmm_sys_util::eventfd::EventFd; @@ -312,6 +313,13 @@ impl VirtioDevice for Entropy { self.device_state = DeviceState::Activated(ActiveState { mem, interrupt }); Ok(()) } + + fn kick(&mut self) { + if self.is_activated() { + info!("kick entropy {}.", self.id()); + self.process_virtio_queues(); + } + } } #[cfg(test)] diff --git a/src/vmm/src/devices/virtio/vsock/device.rs b/src/vmm/src/devices/virtio/vsock/device.rs index 61ca3246d43..bef7fd0af4c 100644 --- a/src/vmm/src/devices/virtio/vsock/device.rs +++ b/src/vmm/src/devices/virtio/vsock/device.rs @@ -24,7 +24,7 @@ use std::fmt::Debug; use std::ops::Deref; use std::sync::Arc; -use log::{error, warn}; +use log::{error, info, warn}; use vmm_sys_util::eventfd::EventFd; use super::super::super::DeviceError; @@ -368,6 +368,19 @@ where fn is_activated(&self) -> bool { self.device_state.is_activated() } + + fn kick(&mut self) { + // Vsock has complicated protocol that isn't resilient to any packet loss, + // so for Vsock we don't support connection persistence through snapshot. + // Any in-flight packets or events are simply lost. + // Vsock is restored 'empty'. + // The only reason we still `kick` it is to make guest process + // `TRANSPORT_RESET_EVENT` event we sent during snapshot creation. + if self.is_activated() { + info!("kick vsock {}.", self.id()); + self.signal_used_queue(0).unwrap(); + } + } } #[cfg(test)] From 5bf30615ae044ad77bc2e3cd773ca65109219a8a Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 30 Jun 2025 18:33:40 +0200 Subject: [PATCH 54/99] refactor: simplify ResourceAllocator internals Instead of storing internal allocators of ResourceAllocator within an Arc> container, just store `ResourceAllocator` itself in an `Arc>`. Apart from that, we get rid of the `ResourceAllocatorState` state object, and just clone `ResourceAllocator` itself when we want to save/restore. Also, make the creation of `ResourceAllocato` infallible, since we know that the ranges we are using are correct. Finally, fix saving/restoring the state of ResourceAllocator. We were actually not resetting it correctly upon snapshot restore. The reason why this was not a problem is that we don't actually need to perform any new allocations post restore at the moment. However, like this we are ready when we need to perform any hot-plugging operations. Also, add a unit-test to ensure that this logic works correctly. 
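For illustration only, a minimal sketch (not part of the patch) of the clone-based
save/restore round-trip this refactor enables, assuming the `ResourceAllocator::new()`,
`allocate_gsi()` and `Persist` APIs shown in the diff below and that `Persist` is in scope:

    // Sketch: saving the allocator is now just a clone of the allocator itself,
    // and the restored allocator remembers earlier allocations, so newly
    // allocated GSIs do not collide with pre-snapshot ones.
    let mut allocator = ResourceAllocator::new();
    let gsi = allocator.allocate_gsi(1).unwrap()[0];   // first GSI, i.e. arch::GSI_BASE
    let state = allocator.save();                      // Self::State is ResourceAllocator itself
    let mut restored = ResourceAllocator::restore((), &state).unwrap();
    assert_eq!(restored.allocate_gsi(1).unwrap()[0], gsi + 1);

This mirrors the behaviour exercised by the new `test_save_restore` and
`test_restore_state_resource_allocator` unit tests further down in this patch.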
Signed-off-by: Babis Chalios --- src/pci/Cargo.toml | 2 +- src/vmm/src/acpi/mod.rs | 33 +++---- src/vmm/src/arch/aarch64/vm.rs | 7 ++ src/vmm/src/arch/x86_64/mod.rs | 20 ++-- src/vmm/src/arch/x86_64/mptable.rs | 31 +++--- src/vmm/src/arch/x86_64/vm.rs | 9 +- src/vmm/src/builder.rs | 6 +- src/vmm/src/device_manager/mmio.rs | 20 ++-- src/vmm/src/device_manager/mod.rs | 2 +- src/vmm/src/device_manager/pci_mngr.rs | 21 ++-- src/vmm/src/device_manager/persist.rs | 2 +- src/vmm/src/devices/acpi/vmgenid.rs | 4 +- src/vmm/src/devices/pci/pci_segment.rs | 11 +-- src/vmm/src/vstate/resources.rs | 127 ++++++++----------------- src/vmm/src/vstate/vm.rs | 58 ++++++++++- 15 files changed, 182 insertions(+), 171 deletions(-) diff --git a/src/pci/Cargo.toml b/src/pci/Cargo.toml index 3549d5010fe..d179854f391 100644 --- a/src/pci/Cargo.toml +++ b/src/pci/Cargo.toml @@ -18,7 +18,7 @@ libc = "0.2.172" log = "0.4.27" serde = { version = "1.0.219", features = ["derive"] } thiserror = "2.0.12" -vm-allocator = "0.1.2" +vm-allocator = "0.1.3" vm-device = { path = "../vm-device" } vm-memory = { version = "0.16.1", features = [ "backend-mmap", diff --git a/src/vmm/src/acpi/mod.rs b/src/vmm/src/acpi/mod.rs index 51711d9eb92..f3b4164745a 100644 --- a/src/vmm/src/acpi/mod.rs +++ b/src/vmm/src/acpi/mod.rs @@ -54,7 +54,7 @@ impl AcpiTableWriter<'_> { /// buffer. It returns the address in which it wrote the table. fn write_acpi_table( &mut self, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, table: &mut S, ) -> Result where @@ -83,7 +83,7 @@ impl AcpiTableWriter<'_> { fn build_dsdt( &mut self, device_manager: &mut DeviceManager, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, ) -> Result { let mut dsdt_data = Vec::new(); @@ -111,7 +111,7 @@ impl AcpiTableWriter<'_> { /// This includes a pointer with the location of the DSDT in guest memory fn build_fadt( &mut self, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, dsdt_addr: u64, ) -> Result { let mut fadt = Fadt::new(OEM_ID, *b"FCVMFADT", OEM_REVISION); @@ -129,7 +129,7 @@ impl AcpiTableWriter<'_> { /// This includes information about the interrupt controllers supported in the platform fn build_madt( &mut self, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, nr_vcpus: u8, ) -> Result { let mut madt = Madt::new( @@ -147,7 +147,7 @@ impl AcpiTableWriter<'_> { /// Currently, we pass to the guest just FADT and MADT tables. fn build_xsdt( &mut self, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, fadt_addr: u64, madt_addr: u64, mcfg_addr: u64, @@ -164,7 +164,7 @@ impl AcpiTableWriter<'_> { /// Build the MCFG table for the guest. 
fn build_mcfg( &mut self, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, pci_mmio_config_addr: u64, ) -> Result { let mut mcfg = Mcfg::new(OEM_ID, *b"FCMVMCFG", OEM_REVISION, pci_mmio_config_addr); @@ -197,7 +197,7 @@ impl AcpiTableWriter<'_> { pub(crate) fn create_acpi_tables( mem: &GuestMemoryMmap, device_manager: &mut DeviceManager, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, vcpus: &[Vcpu], ) -> Result<(), AcpiError> { let mut writer = AcpiTableWriter { mem }; @@ -249,18 +249,19 @@ mod tests { let mut writer = AcpiTableWriter { mem: vmm.vm.guest_memory(), }; + let mut resource_allocator = vmm.vm.resource_allocator(); // This should succeed let mut sdt = MockSdt(vec![0; 4096]); let addr = writer - .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) + .write_acpi_table(&mut resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START); // Let's try to write two 4K pages plus one byte let mut sdt = MockSdt(vec![0; usize::try_from(SYSTEM_MEM_SIZE + 1).unwrap()]); let err = writer - .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) + .write_acpi_table(&mut resource_allocator, &mut sdt) .unwrap_err(); assert!( matches!( @@ -275,27 +276,27 @@ mod tests { // succeed. let mut sdt = MockSdt(vec![0; 5]); let addr = writer - .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) + .write_acpi_table(&mut resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4096); let mut sdt = MockSdt(vec![0; 2]); let addr = writer - .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) + .write_acpi_table(&mut resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4101); let mut sdt = MockSdt(vec![0; 4]); let addr = writer - .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) + .write_acpi_table(&mut resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4103); let mut sdt = MockSdt(vec![0; 8]); let addr = writer - .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) + .write_acpi_table(&mut resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4107); let mut sdt = MockSdt(vec![0; 16]); let addr = writer - .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) + .write_acpi_table(&mut resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4115); } @@ -312,11 +313,11 @@ mod tests { let mut writer = AcpiTableWriter { mem: vm.guest_memory(), }; - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); let mut sdt = MockSdt(vec![0; usize::try_from(SYSTEM_MEM_SIZE).unwrap()]); let err = writer - .write_acpi_table(&resource_allocator, &mut sdt) + .write_acpi_table(&mut resource_allocator, &mut sdt) .unwrap_err(); assert!( matches!( diff --git a/src/vmm/src/arch/aarch64/vm.rs b/src/vmm/src/arch/aarch64/vm.rs index e54723f5b6d..eaec0932a42 100644 --- a/src/vmm/src/arch/aarch64/vm.rs +++ b/src/vmm/src/arch/aarch64/vm.rs @@ -1,11 +1,14 @@ // Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 +use std::sync::Mutex; + use serde::{Deserialize, Serialize}; use crate::Kvm; use crate::arch::aarch64::gic::GicState; use crate::vstate::memory::{GuestMemoryExtension, GuestMemoryState}; +use crate::vstate::resources::ResourceAllocator; use crate::vstate::vm::{VmCommon, VmError}; /// Structure representing the current architecture's understand of what a "virtual machine" is. @@ -74,6 +77,7 @@ impl ArchVm { .get_irqchip() .save_device(mpidrs) .map_err(ArchVmError::SaveGic)?, + resource_allocator: self.resource_allocator().clone(), }) } @@ -86,6 +90,7 @@ impl ArchVm { self.get_irqchip() .restore_device(mpidrs, &state.gic) .map_err(ArchVmError::RestoreGic)?; + self.common.resource_allocator = Mutex::new(state.resource_allocator.clone()); Ok(()) } @@ -98,4 +103,6 @@ pub struct VmState { pub memory: GuestMemoryState, /// GIC state. pub gic: GicState, + /// resource allocator + pub resource_allocator: ResourceAllocator, } diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index 5307dbdf710..1822abb9009 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -217,7 +217,7 @@ pub fn configure_system_for_boot( // Note that this puts the mptable at the last 1k of Linux's 640k base RAM mptable::setup_mptable( vm.guest_memory(), - &vm.common.resource_allocator, + &mut vm.resource_allocator(), vcpu_config.vcpu_count, ) .map_err(ConfigurationError::MpTableSetup)?; @@ -241,7 +241,7 @@ pub fn configure_system_for_boot( create_acpi_tables( vm.guest_memory(), device_manager, - &vm.common.resource_allocator, + &mut vm.resource_allocator(), vcpus, )?; Ok(()) @@ -607,8 +607,8 @@ mod tests { fn test_system_configuration() { let no_vcpus = 4; let gm = single_region_mem(0x10000); - let resource_allocator = ResourceAllocator::new().unwrap(); - let err = mptable::setup_mptable(&gm, &resource_allocator, 1); + let mut resource_allocator = ResourceAllocator::new(); + let err = mptable::setup_mptable(&gm, &mut resource_allocator, 1); assert!(matches!( err.unwrap_err(), mptable::MptableError::NotEnoughMemory @@ -617,24 +617,24 @@ mod tests { // Now assigning some memory that falls before the 32bit memory hole. let mem_size = mib_to_bytes(128); let gm = arch_mem(mem_size); - let resource_allocator = ResourceAllocator::new().unwrap(); - mptable::setup_mptable(&gm, &resource_allocator, no_vcpus).unwrap(); + let mut resource_allocator = ResourceAllocator::new(); + mptable::setup_mptable(&gm, &mut resource_allocator, no_vcpus).unwrap(); configure_64bit_boot(&gm, GuestAddress(0), 0, &None).unwrap(); configure_pvh(&gm, GuestAddress(0), &None).unwrap(); // Now assigning some memory that is equal to the start of the 32bit memory hole. let mem_size = mib_to_bytes(3328); let gm = arch_mem(mem_size); - let resource_allocator = ResourceAllocator::new().unwrap(); - mptable::setup_mptable(&gm, &resource_allocator, no_vcpus).unwrap(); + let mut resource_allocator = ResourceAllocator::new(); + mptable::setup_mptable(&gm, &mut resource_allocator, no_vcpus).unwrap(); configure_64bit_boot(&gm, GuestAddress(0), 0, &None).unwrap(); configure_pvh(&gm, GuestAddress(0), &None).unwrap(); // Now assigning some memory that falls after the 32bit memory hole. 
let mem_size = mib_to_bytes(3330); let gm = arch_mem(mem_size); - let resource_allocator = ResourceAllocator::new().unwrap(); - mptable::setup_mptable(&gm, &resource_allocator, no_vcpus).unwrap(); + let mut resource_allocator = ResourceAllocator::new(); + mptable::setup_mptable(&gm, &mut resource_allocator, no_vcpus).unwrap(); configure_64bit_boot(&gm, GuestAddress(0), 0, &None).unwrap(); configure_pvh(&gm, GuestAddress(0), &None).unwrap(); } diff --git a/src/vmm/src/arch/x86_64/mptable.rs b/src/vmm/src/arch/x86_64/mptable.rs index 17b2900aeb2..a4b1e2fa632 100644 --- a/src/vmm/src/arch/x86_64/mptable.rs +++ b/src/vmm/src/arch/x86_64/mptable.rs @@ -116,7 +116,7 @@ fn compute_mp_size(num_cpus: u8) -> usize { /// Performs setup of the MP table for the given `num_cpus`. pub fn setup_mptable( mem: &GuestMemoryMmap, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, num_cpus: u8, ) -> Result<(), MptableError> { if num_cpus > MAX_SUPPORTED_CPUS { @@ -334,27 +334,27 @@ mod tests { fn bounds_check() { let num_cpus = 4; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); - setup_mptable(&mem, &resource_allocator, num_cpus).unwrap(); + setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); } #[test] fn bounds_check_fails() { let num_cpus = 4; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus) - 1); - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); - setup_mptable(&mem, &resource_allocator, num_cpus).unwrap_err(); + setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap_err(); } #[test] fn mpf_intel_checksum() { let num_cpus = 1; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); - setup_mptable(&mem, &resource_allocator, num_cpus).unwrap(); + setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); let mpf_intel: mpspec::mpf_intel = mem.read_obj(GuestAddress(SYSTEM_MEM_START)).unwrap(); @@ -365,9 +365,9 @@ mod tests { fn mpc_table_checksum() { let num_cpus = 4; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); - setup_mptable(&mem, &resource_allocator, num_cpus).unwrap(); + setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); let mpf_intel: mpspec::mpf_intel = mem.read_obj(GuestAddress(SYSTEM_MEM_START)).unwrap(); let mpc_offset = GuestAddress(u64::from(mpf_intel.physptr)); @@ -388,9 +388,9 @@ mod tests { fn mpc_entry_count() { let num_cpus = 1; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); - setup_mptable(&mem, &resource_allocator, num_cpus).unwrap(); + setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); let mpf_intel: mpspec::mpf_intel = mem.read_obj(GuestAddress(SYSTEM_MEM_START)).unwrap(); let mpc_offset = GuestAddress(u64::from(mpf_intel.physptr)); @@ -419,8 +419,9 @@ mod tests { let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(MAX_SUPPORTED_CPUS)); for i in 0..MAX_SUPPORTED_CPUS { - let resource_allocator = 
ResourceAllocator::new().unwrap(); - setup_mptable(&mem, &resource_allocator, i).unwrap(); + let mut resource_allocator = ResourceAllocator::new(); + + setup_mptable(&mem, &mut resource_allocator, i).unwrap(); let mpf_intel: mpspec::mpf_intel = mem.read_obj(GuestAddress(SYSTEM_MEM_START)).unwrap(); @@ -450,9 +451,9 @@ mod tests { fn cpu_entry_count_max() { let cpus = MAX_SUPPORTED_CPUS + 1; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(cpus)); - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); - let result = setup_mptable(&mem, &resource_allocator, cpus).unwrap_err(); + let result = setup_mptable(&mem, &mut resource_allocator, cpus).unwrap_err(); assert_eq!(result, MptableError::TooManyCpus); } } diff --git a/src/vmm/src/arch/x86_64/vm.rs b/src/vmm/src/arch/x86_64/vm.rs index fbc27c82a60..e194296928d 100644 --- a/src/vmm/src/arch/x86_64/vm.rs +++ b/src/vmm/src/arch/x86_64/vm.rs @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use std::fmt; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; use kvm_bindings::{ KVM_CLOCK_TSC_STABLE, KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, @@ -15,7 +15,7 @@ use crate::arch::x86_64::msr::MsrError; use crate::snapshot::Persist; use crate::utils::u64_to_usize; use crate::vstate::memory::{GuestMemoryExtension, GuestMemoryState}; -use crate::vstate::resources::ResourceAllocatorState; +use crate::vstate::resources::ResourceAllocator; use crate::vstate::vm::{VmCommon, VmError}; /// Error type for [`Vm::restore_state`] @@ -142,6 +142,7 @@ impl ArchVm { self.fd() .set_irqchip(&state.ioapic) .map_err(ArchVmError::SetIrqChipIoAPIC)?; + self.common.resource_allocator = Mutex::new(state.resource_allocator.clone()); Ok(()) } @@ -195,7 +196,7 @@ impl ArchVm { Ok(VmState { memory: self.common.guest_memory.describe(), - resource_allocator: self.common.resource_allocator.save(), + resource_allocator: self.resource_allocator().save(), pitstate, clock, pic_master, @@ -221,7 +222,7 @@ pub struct VmState { /// guest memory state pub memory: GuestMemoryState, /// resource allocator - pub resource_allocator: ResourceAllocatorState, + pub resource_allocator: ResourceAllocator, pitstate: kvm_pit_state2, clock: kvm_clock_data, // TODO: rename this field to adopt inclusive language once Linux updates it, too. 
diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index f2da5a6d045..88d7f56cb4e 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -254,7 +254,7 @@ pub fn build_microvm_for_boot( #[cfg(target_arch = "aarch64")] if vcpus[0].kvm_vcpu.supports_pvtime() { - setup_pvtime(&vm.common.resource_allocator, &mut vcpus)?; + setup_pvtime(&mut vm.resource_allocator(), &mut vcpus)?; } else { log::warn!("Vcpus do not support pvtime, steal time will not be reported to guest"); } @@ -515,7 +515,7 @@ const STEALTIME_STRUCT_MEM_SIZE: u64 = 64; /// Helper method to allocate steal time region #[cfg(target_arch = "aarch64")] fn allocate_pvtime_region( - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, vcpu_count: usize, policy: vm_allocator::AllocPolicy, ) -> Result { @@ -529,7 +529,7 @@ fn allocate_pvtime_region( /// Sets up pvtime for all vcpus #[cfg(target_arch = "aarch64")] fn setup_pvtime( - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, vcpus: &mut [Vcpu], ) -> Result<(), StartMicrovmError> { // Alloc sys mem for steal time region diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 6b3ea95a5b5..d0c116ce20d 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -153,7 +153,7 @@ impl MMIODeviceManager { /// Allocates resources for a new device to be added. fn allocate_mmio_resources( &mut self, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, irq_count: u32, ) -> Result { let irq = match resource_allocator.allocate_gsi(irq_count)?[..] { @@ -242,7 +242,7 @@ impl MMIODeviceManager { _cmdline: &mut kernel_cmdline::Cmdline, ) -> Result<(), MmioError> { let device = MMIODevice { - resources: self.allocate_mmio_resources(&vm.common.resource_allocator, 1)?, + resources: self.allocate_mmio_resources(&mut vm.resource_allocator(), 1)?, inner: Arc::new(Mutex::new(mmio_device)), }; @@ -276,7 +276,7 @@ impl MMIODeviceManager { let device_info = if let Some(device_info) = device_info_opt { device_info } else { - let gsi = vm.common.resource_allocator.allocate_gsi(1)?; + let gsi = vm.resource_allocator().allocate_gsi(1)?; MMIODeviceInfo { addr: SERIAL_MEM_START, len: MMIO_LEN, @@ -335,7 +335,7 @@ impl MMIODeviceManager { let device_info = if let Some(device_info) = device_info_opt { device_info } else { - let gsi = vm.common.resource_allocator.allocate_gsi(1)?; + let gsi = vm.resource_allocator().allocate_gsi(1)?; MMIODeviceInfo { addr: RTC_MEM_START, len: MMIO_LEN, @@ -746,10 +746,10 @@ pub(crate) mod tests { #[test] fn test_no_irq_allocation() { let mut device_manager = MMIODeviceManager::new(); - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); let device_info = device_manager - .allocate_mmio_resources(&resource_allocator, 0) + .allocate_mmio_resources(&mut resource_allocator, 0) .unwrap(); assert!(device_info.irq.is_none()); } @@ -757,10 +757,10 @@ pub(crate) mod tests { #[test] fn test_irq_allocation() { let mut device_manager = MMIODeviceManager::new(); - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); let device_info = device_manager - .allocate_mmio_resources(&resource_allocator, 1) + .allocate_mmio_resources(&mut resource_allocator, 1) .unwrap(); assert_eq!(device_info.irq.unwrap(), crate::arch::GSI_BASE); } @@ -768,12 +768,12 @@ pub(crate) mod tests { #[test] fn 
test_allocation_failure() { let mut device_manager = MMIODeviceManager::new(); - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); assert_eq!( format!( "{}", device_manager - .allocate_mmio_resources(&resource_allocator, 2) + .allocate_mmio_resources(&mut resource_allocator, 2) .unwrap_err() ), "Invalid MMIO IRQ configuration.".to_string() diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 34d1ba73091..cfc7fe44d79 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -221,7 +221,7 @@ impl DeviceManager { mem: &GuestMemoryMmap, vm: &Vm, ) -> Result<(), AttachDeviceError> { - let vmgenid = VmGenId::new(mem, &vm.common.resource_allocator)?; + let vmgenid = VmGenId::new(mem, &mut vm.resource_allocator())?; self.acpi_devices.attach_vmgenid(vmgenid, vm)?; Ok(()) } diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index 4d3a6ebf0f7..0727f76d269 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -3,6 +3,7 @@ use std::collections::HashMap; use std::fmt::Debug; +use std::ops::DerefMut; use std::sync::{Arc, Mutex}; use event_manager::{MutEventSubscriber, SubscriberOps}; @@ -130,7 +131,7 @@ impl PciDevices { let pci_device_bdf = pci_segment.next_device_bdf()?; debug!("Allocating BDF: {pci_device_bdf:?} for device"); let mem = vm.guest_memory().clone(); - let resource_allocator = &vm.common.resource_allocator; + let device_type: u32 = device.lock().expect("Poisoned lock").device_type(); // Allocate one MSI vector per queue, plus one for configuration @@ -144,16 +145,14 @@ impl PciDevices { VirtioPciDevice::new(id.clone(), mem, device, msix_vectors, pci_device_bdf.into())?; // Allocate bars - let mut mmio32_allocator = resource_allocator - .mmio32_memory - .lock() - .expect("Poisoned lock"); - let mut mmio64_allocator = resource_allocator - .mmio64_memory - .lock() - .expect("Poisoned lock"); - - virtio_device.allocate_bars(&mut mmio32_allocator, &mut mmio64_allocator, None)?; + let mut resource_allocator_lock = vm.resource_allocator(); + let resource_allocator = resource_allocator_lock.deref_mut(); + + virtio_device.allocate_bars( + &mut resource_allocator.mmio32_memory, + &mut resource_allocator.mmio64_memory, + None, + )?; let virtio_device = Arc::new(Mutex::new(virtio_device)); pci_segment diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index fca94595372..74e71f3a6bf 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -201,7 +201,7 @@ impl<'a> Persist<'a> for ACPIDeviceManager { let vmgenid = VmGenId::restore( VMGenIdConstructorArgs { mem: constructor_args.mem, - resource_allocator: &constructor_args.vm.common.resource_allocator, + resource_allocator: &mut constructor_args.vm.resource_allocator(), }, vmgenid_args, )?; diff --git a/src/vmm/src/devices/acpi/vmgenid.rs b/src/vmm/src/devices/acpi/vmgenid.rs index 5c8d4ecbc51..6d096007193 100644 --- a/src/vmm/src/devices/acpi/vmgenid.rs +++ b/src/vmm/src/devices/acpi/vmgenid.rs @@ -86,7 +86,7 @@ impl VmGenId { /// Allocate memory and a GSI for sending notifications and build the device pub fn new( mem: &GuestMemoryMmap, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, ) -> Result { let gsi = resource_allocator.allocate_gsi(1)?; // The generation ID needs to live in an 8-byte aligned buffer @@ -133,7 
+133,7 @@ pub struct VMGenIDState { #[derive(Debug)] pub struct VMGenIdConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, - pub resource_allocator: &'a ResourceAllocator, + pub resource_allocator: &'a mut ResourceAllocator, } impl<'a> Persist<'a> for VmGenId { diff --git a/src/vmm/src/devices/pci/pci_segment.rs b/src/vmm/src/devices/pci/pci_segment.rs index c37763eab3a..7deaa027f7b 100644 --- a/src/vmm/src/devices/pci/pci_segment.rs +++ b/src/vmm/src/devices/pci/pci_segment.rs @@ -80,14 +80,13 @@ impl PciSegment { PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, )?; - let mem32_allocator = vm.common.resource_allocator.mmio32_memory.clone(); - let mem64_allocator = vm.common.resource_allocator.mmio64_memory.clone(); + let resource_allocator = vm.resource_allocator(); - let start_of_mem32_area = mem32_allocator.lock().unwrap().base(); - let end_of_mem32_area = mem32_allocator.lock().unwrap().end(); + let start_of_mem32_area = resource_allocator.mmio32_memory.base(); + let end_of_mem32_area = resource_allocator.mmio32_memory.end(); - let start_of_mem64_area = mem64_allocator.lock().unwrap().base(); - let end_of_mem64_area = mem64_allocator.lock().unwrap().end(); + let start_of_mem64_area = resource_allocator.mmio64_memory.base(); + let end_of_mem64_area = resource_allocator.mmio64_memory.end(); let segment = PciSegment { id, diff --git a/src/vmm/src/vstate/resources.rs b/src/vmm/src/vstate/resources.rs index b0cb0ab625d..6571abf8cfc 100644 --- a/src/vmm/src/vstate/resources.rs +++ b/src/vmm/src/vstate/resources.rs @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 use std::convert::Infallible; -use std::sync::{Arc, Mutex}; use serde::{Deserialize, Serialize}; pub use vm_allocator::AllocPolicy; @@ -18,36 +17,44 @@ use crate::snapshot::Persist; /// * GSIs for legacy x86_64 devices /// * GSIs for MMIO devicecs /// * Memory allocations in the MMIO address space -#[derive(Debug)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct ResourceAllocator { /// Allocator for device interrupt lines - pub gsi_allocator: Arc>, + pub gsi_allocator: IdAllocator, /// Allocator for memory in the 32-bit MMIO address space - pub mmio32_memory: Arc>, + pub mmio32_memory: AddressAllocator, /// Allocator for memory in the 64-bit MMIO address space - pub mmio64_memory: Arc>, + pub mmio64_memory: AddressAllocator, /// Memory allocator for system data - pub system_memory: Arc>, + pub system_memory: AddressAllocator, +} + +impl Default for ResourceAllocator { + fn default() -> Self { + ResourceAllocator::new() + } } impl ResourceAllocator { /// Create a new resource allocator for Firecracker devices - pub fn new() -> Result { - Ok(Self { - gsi_allocator: Arc::new(Mutex::new(IdAllocator::new(arch::GSI_BASE, arch::GSI_MAX)?)), - mmio32_memory: Arc::new(Mutex::new(AddressAllocator::new( + pub fn new() -> Self { + // It is fine for us to unwrap the following since we know we are passing valid ranges for + // all allocators + Self { + gsi_allocator: IdAllocator::new(arch::GSI_BASE, arch::GSI_MAX).unwrap(), + mmio32_memory: AddressAllocator::new( arch::MEM_32BIT_DEVICES_START, arch::MEM_32BIT_DEVICES_SIZE, - )?)), - mmio64_memory: Arc::new(Mutex::new(AddressAllocator::new( + ) + .unwrap(), + mmio64_memory: AddressAllocator::new( arch::MEM_64BIT_DEVICES_START, arch::MEM_64BIT_DEVICES_SIZE, - )?)), - system_memory: Arc::new(Mutex::new(AddressAllocator::new( - arch::SYSTEM_MEM_START, - arch::SYSTEM_MEM_SIZE, - )?)), - }) + ) + .unwrap(), + system_memory: AddressAllocator::new(arch::SYSTEM_MEM_START, arch::SYSTEM_MEM_SIZE) + 
.unwrap(), + } } /// Allocate a number of GSIs @@ -55,17 +62,16 @@ impl ResourceAllocator { /// # Arguments /// /// * `gsi_count` - The number of GSIs to allocate - pub fn allocate_gsi(&self, gsi_count: u32) -> Result, vm_allocator::Error> { - let mut gsi_allocator = self.gsi_allocator.lock().expect("Poisoned lock"); + pub fn allocate_gsi(&mut self, gsi_count: u32) -> Result, vm_allocator::Error> { let mut gsis = Vec::with_capacity(gsi_count as usize); for _ in 0..gsi_count { - match gsi_allocator.allocate_id() { + match self.gsi_allocator.allocate_id() { Ok(gsi) => gsis.push(gsi), Err(err) => { // It is ok to unwrap here, we just allocated the GSI gsis.into_iter().for_each(|gsi| { - gsi_allocator.free_id(gsi).unwrap(); + self.gsi_allocator.free_id(gsi).unwrap(); }); return Err(err); } @@ -85,15 +91,13 @@ impl ResourceAllocator { /// * `alignment` - The alignment of the address of the first byte /// * `policy` - A [`vm_allocator::AllocPolicy`] variant for determining the allocation policy pub fn allocate_32bit_mmio_memory( - &self, + &mut self, size: u64, alignment: u64, policy: AllocPolicy, ) -> Result { Ok(self .mmio32_memory - .lock() - .expect("Poisoned lock") .allocate(size, alignment, policy)? .start()) } @@ -108,15 +112,13 @@ impl ResourceAllocator { /// * `alignment` - The alignment of the address of the first byte /// * `policy` - A [`vm_allocator::AllocPolicy`] variant for determining the allocation policy pub fn allocate_64bit_mmio_memory( - &self, + &mut self, size: u64, alignment: u64, policy: AllocPolicy, ) -> Result { Ok(self .mmio64_memory - .lock() - .expect("Poisoned lock") .allocate(size, alignment, policy)? .start()) } @@ -131,78 +133,32 @@ impl ResourceAllocator { /// * `alignment` - The alignment of the address of the first byte /// * `policy` - A [`vm_allocator::AllocPolicy`] variant for determining the allocation policy pub fn allocate_system_memory( - &self, + &mut self, size: u64, alignment: u64, policy: AllocPolicy, ) -> Result { Ok(self .system_memory - .lock() - .expect("Poisoned lock") .allocate(size, alignment, policy)? 
.start()) } } impl<'a> Persist<'a> for ResourceAllocator { - type State = ResourceAllocatorState; + type State = ResourceAllocator; type ConstructorArgs = (); type Error = Infallible; fn save(&self) -> Self::State { - ResourceAllocatorState { - gsi_allocator: self.gsi_allocator.clone(), - mmio32_memory: self.mmio32_memory.clone(), - mmio64_memory: self.mmio64_memory.clone(), - system_memory: self.system_memory.clone(), - } + self.clone() } fn restore( _constructor_args: Self::ConstructorArgs, state: &Self::State, ) -> std::result::Result { - Ok(ResourceAllocator { - gsi_allocator: state.gsi_allocator.clone(), - mmio32_memory: state.mmio32_memory.clone(), - mmio64_memory: state.mmio64_memory.clone(), - system_memory: state.system_memory.clone(), - }) - } -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -/// State of a ResourceAllocator -pub struct ResourceAllocatorState { - /// Allocator for device interrupt lines - pub gsi_allocator: Arc>, - /// Allocator for memory in the 32-bit MMIO address space - pub mmio32_memory: Arc>, - /// Allocator for memory in the 64-bit MMIO address space - pub mmio64_memory: Arc>, - /// Memory allocator for system data - pub system_memory: Arc>, -} - -impl Default for ResourceAllocatorState { - fn default() -> Self { - Self { - gsi_allocator: Arc::new(Mutex::new( - IdAllocator::new(arch::IRQ_BASE, arch::IRQ_MAX).unwrap(), - )), - mmio32_memory: Arc::new(Mutex::new( - AddressAllocator::new(arch::MEM_32BIT_DEVICES_START, arch::MEM_32BIT_DEVICES_SIZE) - .unwrap(), - )), - mmio64_memory: Arc::new(Mutex::new( - AddressAllocator::new(arch::MEM_64BIT_DEVICES_START, arch::MEM_64BIT_DEVICES_SIZE) - .unwrap(), - )), - system_memory: Arc::new(Mutex::new( - AddressAllocator::new(arch::SYSTEM_MEM_START, arch::SYSTEM_MEM_SIZE).unwrap(), - )), - } + Ok(state.clone()) } } @@ -210,7 +166,7 @@ impl Default for ResourceAllocatorState { mod tests { use vm_allocator::AllocPolicy; - use super::{ResourceAllocator, ResourceAllocatorState}; + use super::ResourceAllocator; use crate::arch::{self, GSI_BASE}; use crate::snapshot::{Persist, Snapshot}; @@ -218,7 +174,7 @@ mod tests { #[test] fn test_allocate_gsi() { - let allocator = ResourceAllocator::new().unwrap(); + let mut allocator = ResourceAllocator::new(); // asking for 0 IRQs should return us an empty vector assert_eq!(allocator.allocate_gsi(0), Ok(vec![])); // We cannot allocate more GSIs than available @@ -239,7 +195,7 @@ mod tests { // But we should be able to ask for 0 GSIs assert_eq!(allocator.allocate_gsi(0), Ok(vec![])); - let allocator = ResourceAllocator::new().unwrap(); + let mut allocator = ResourceAllocator::new(); // We should be able to allocate 1 GSI assert_eq!(allocator.allocate_gsi(1), Ok(vec![arch::GSI_BASE])); // We can't allocate MAX_IRQS any more @@ -258,18 +214,17 @@ mod tests { fn clone_allocator(allocator: &ResourceAllocator) -> ResourceAllocator { let mut buf = vec![0u8; 1024]; Snapshot::serialize(&mut buf.as_mut_slice(), &allocator.save()).unwrap(); - let restored_state: ResourceAllocatorState = - Snapshot::deserialize(&mut buf.as_slice()).unwrap(); + let restored_state: ResourceAllocator = Snapshot::deserialize(&mut buf.as_slice()).unwrap(); ResourceAllocator::restore((), &restored_state).unwrap() } #[test] fn test_save_restore() { - let allocator0 = ResourceAllocator::new().unwrap(); + let mut allocator0 = ResourceAllocator::new(); let gsi_0 = allocator0.allocate_gsi(1).unwrap()[0]; assert_eq!(gsi_0, GSI_BASE); - let allocator1 = clone_allocator(&allocator0); + let mut allocator1 = 
clone_allocator(&allocator0); let gsi_1 = allocator1.allocate_gsi(1).unwrap()[0]; assert_eq!(gsi_1, GSI_BASE + 1); let mmio32_mem = allocator1 @@ -285,7 +240,7 @@ mod tests { .unwrap(); assert_eq!(system_mem, arch::SYSTEM_MEM_START); - let allocator2 = clone_allocator(&allocator1); + let mut allocator2 = clone_allocator(&allocator1); allocator2 .allocate_32bit_mmio_memory(0x42, 1, AllocPolicy::ExactMatch(mmio32_mem)) .unwrap_err(); diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index ff29b1a2d1d..e60a1b0784a 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -10,7 +10,7 @@ use std::fs::OpenOptions; use std::io::Write; use std::path::Path; use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::{Arc, Mutex}; +use std::sync::{Arc, Mutex, MutexGuard}; #[cfg(target_arch = "x86_64")] use kvm_bindings::KVM_IRQCHIP_IOAPIC; @@ -247,7 +247,7 @@ pub struct VmCommon { /// Interrupts used by Vm's devices pub interrupts: Mutex>, /// Allocator for VM resources - pub resource_allocator: Arc, + pub resource_allocator: Mutex, /// MMIO bus pub mmio_bus: Arc, } @@ -324,7 +324,7 @@ impl Vm { max_memslots: kvm.max_nr_memslots(), guest_memory: GuestMemoryMmap::default(), interrupts: Mutex::new(HashMap::new()), - resource_allocator: Arc::new(ResourceAllocator::new()?), + resource_allocator: Mutex::new(ResourceAllocator::new()), mmio_bus: Arc::new(vm_device::Bus::new()), }) } @@ -410,6 +410,14 @@ impl Vm { &self.common.guest_memory } + /// Gets a mutable reference to this [`Vm`]'s [`ResourceAllocator`] object + pub fn resource_allocator(&self) -> MutexGuard { + self.common + .resource_allocator + .lock() + .expect("Poisoned lock") + } + /// Resets the KVM dirty bitmap for each of the guest's memory regions. pub fn reset_dirty_bitmap(&self) { self.guest_memory() @@ -587,8 +595,7 @@ impl Vm { debug!("Creating new MSI group with {count} vectors"); let mut irq_routes = HashMap::with_capacity(count as usize); for (gsi, i) in vm - .common - .resource_allocator + .resource_allocator() .allocate_gsi(count as u32)? .iter() .zip(0u32..) 
@@ -678,6 +685,8 @@ pub(crate) mod tests { use vm_memory::mmap::MmapRegionBuilder; use super::*; + #[cfg(target_arch = "x86_64")] + use crate::snapshot::Snapshot; use crate::test_utils::single_region_mem_raw; use crate::utils::mib_to_bytes; use crate::vstate::kvm::Kvm; @@ -1015,4 +1024,43 @@ pub(crate) mod tests { assert!(!new_vector.enabled.load(Ordering::Acquire)); } } + + #[cfg(target_arch = "x86_64")] + #[test] + fn test_restore_state_resource_allocator() { + use vm_allocator::AllocPolicy; + + let mut snapshot_data = vec![0u8; 10000]; + let (_, mut vm) = setup_vm_with_memory(0x1000); + vm.setup_irqchip().unwrap(); + + // Allocate a GSI and some memory and make sure they are still allocated after restore + let (gsi, range) = { + let mut resource_allocator = vm.resource_allocator(); + + let gsi = resource_allocator.allocate_gsi(1).unwrap()[0]; + let range = resource_allocator + .allocate_32bit_mmio_memory(1024, 1024, AllocPolicy::FirstMatch) + .unwrap(); + (gsi, range) + }; + + let state = vm.save_state().unwrap(); + Snapshot::serialize(&mut snapshot_data.as_mut_slice(), &state).unwrap(); + + let restored_state: VmState = Snapshot::deserialize(&mut snapshot_data.as_slice()).unwrap(); + vm.restore_state(&restored_state).unwrap(); + + let mut resource_allocator = vm.resource_allocator(); + let gsi_new = resource_allocator.allocate_gsi(1).unwrap()[0]; + assert_eq!(gsi + 1, gsi_new); + + resource_allocator + .allocate_32bit_mmio_memory(1024, 1024, AllocPolicy::ExactMatch(range)) + .unwrap_err(); + let range_new = resource_allocator + .allocate_32bit_mmio_memory(1024, 1024, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(range + 1024, range_new); + } } From ea61d73fa7b04db95ce1984d715285c04851df6b Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 2 Jul 2025 16:17:48 +0200 Subject: [PATCH 55/99] fix(vsock): pass correct index when triggering interrupts We were confusing queue indexex with event indexes, when passing the index of the queue that needed to be triggered after handling events. Fix the logic to pass the correct index. This refactors a bit the code to signal the queues in each event handler method. With MMIO we don't need to signal each queue independently (one signal will cause the guest to scan all queues). With PCI though, we are using MSI-X, so we need to signal each queue independently. Also, change vsock functional integration tests to also run for PCI-enabled microVMs. Signed-off-by: Babis Chalios --- src/vmm/src/devices/virtio/vsock/device.rs | 2 +- .../src/devices/virtio/vsock/event_handler.rs | 78 ++++++++++--------- .../functional/test_vsock.py | 20 +++-- 3 files changed, 53 insertions(+), 47 deletions(-) diff --git a/src/vmm/src/devices/virtio/vsock/device.rs b/src/vmm/src/devices/virtio/vsock/device.rs index bef7fd0af4c..56426d1ea0f 100644 --- a/src/vmm/src/devices/virtio/vsock/device.rs +++ b/src/vmm/src/devices/virtio/vsock/device.rs @@ -378,7 +378,7 @@ where // `TRANSPORT_RESET_EVENT` event we sent during snapshot creation. 
if self.is_activated() { info!("kick vsock {}.", self.id()); - self.signal_used_queue(0).unwrap(); + self.signal_used_queue(EVQ_INDEX).unwrap(); } } } diff --git a/src/vmm/src/devices/virtio/vsock/event_handler.rs b/src/vmm/src/devices/virtio/vsock/event_handler.rs index b4445e298ae..e9e325c47e4 100755 --- a/src/vmm/src/devices/virtio/vsock/event_handler.rs +++ b/src/vmm/src/devices/virtio/vsock/event_handler.rs @@ -47,81 +47,82 @@ where const PROCESS_EVQ: u32 = 3; const PROCESS_NOTIFY_BACKEND: u32 = 4; - pub fn handle_rxq_event(&mut self, evset: EventSet) -> bool { + pub fn handle_rxq_event(&mut self, evset: EventSet) { if evset != EventSet::IN { warn!("vsock: rxq unexpected event {:?}", evset); METRICS.rx_queue_event_fails.inc(); - return false; + return; } - let mut raise_irq = false; if let Err(err) = self.queue_events[RXQ_INDEX].read() { error!("Failed to get vsock rx queue event: {:?}", err); METRICS.rx_queue_event_fails.inc(); } else if self.backend.has_pending_rx() { - // OK to unwrap: Only QueueError::InvalidAvailIdx is returned, and we explicitly - // want to panic on that one. - raise_irq |= self.process_rx().unwrap(); + if self.process_rx().unwrap() { + self.signal_used_queue(RXQ_INDEX) + .expect("vsock: Could not trigger device interrupt or RX queue"); + } METRICS.rx_queue_event_count.inc(); } - raise_irq } - pub fn handle_txq_event(&mut self, evset: EventSet) -> bool { + pub fn handle_txq_event(&mut self, evset: EventSet) { if evset != EventSet::IN { warn!("vsock: txq unexpected event {:?}", evset); METRICS.tx_queue_event_fails.inc(); - return false; + return; } - let mut raise_irq = false; if let Err(err) = self.queue_events[TXQ_INDEX].read() { error!("Failed to get vsock tx queue event: {:?}", err); METRICS.tx_queue_event_fails.inc(); } else { - // OK to unwrap: Only QueueError::InvalidAvailIdx is returned, and we explicitly - // want to panic on that one. - raise_irq |= self.process_tx().unwrap(); + if self.process_tx().unwrap() { + self.signal_used_queue(TXQ_INDEX) + .expect("vsock: Could not trigger device interrupt or TX queue"); + } METRICS.tx_queue_event_count.inc(); // The backend may have queued up responses to the packets we sent during // TX queue processing. If that happened, we need to fetch those responses // and place them into RX buffers. - if self.backend.has_pending_rx() { - raise_irq |= self.process_rx().unwrap(); + if self.backend.has_pending_rx() && self.process_rx().unwrap() { + self.signal_used_queue(RXQ_INDEX) + .expect("vsock: Could not trigger device interrupt or RX queue"); } } - raise_irq } - pub fn handle_evq_event(&mut self, evset: EventSet) -> bool { + pub fn handle_evq_event(&mut self, evset: EventSet) { if evset != EventSet::IN { warn!("vsock: evq unexpected event {:?}", evset); METRICS.ev_queue_event_fails.inc(); - return false; + return; } if let Err(err) = self.queue_events[EVQ_INDEX].read() { error!("Failed to consume vsock evq event: {:?}", err); METRICS.ev_queue_event_fails.inc(); } - false } /// Notify backend of new events. - pub fn notify_backend(&mut self, evset: EventSet) -> Result { + pub fn notify_backend(&mut self, evset: EventSet) -> Result<(), InvalidAvailIdx> { self.backend.notify(evset); // After the backend has been kicked, it might've freed up some resources, so we // can attempt to send it more data to process. // In particular, if `self.backend.send_pkt()` halted the TX queue processing (by // returning an error) at some point in the past, now is the time to try walking the // TX queue again. 
- // OK to unwrap: Only QueueError::InvalidAvailIdx is returned, and we explicitly - // want to panic on that one. - let mut raise_irq = self.process_tx()?; - if self.backend.has_pending_rx() { - raise_irq |= self.process_rx()?; + if self.process_tx()? { + self.signal_used_queue(TXQ_INDEX) + .expect("vsock: Could not trigger device interrupt or TX queue"); + } + if self.backend.has_pending_rx() && self.process_rx()? { + self.signal_used_queue(RXQ_INDEX) + .expect("vsock: Could not trigger device interrupt or RX queue"); } - Ok(raise_irq) + + Ok(()) } fn register_runtime_events(&self, ops: &mut EventOps) { @@ -189,19 +190,14 @@ where let evset = event.event_set(); if self.is_activated() { - let mut raise_irq = false; match source { Self::PROCESS_ACTIVATE => self.handle_activate_event(ops), - Self::PROCESS_RXQ => raise_irq = self.handle_rxq_event(evset), - Self::PROCESS_TXQ => raise_irq = self.handle_txq_event(evset), - Self::PROCESS_EVQ => raise_irq = self.handle_evq_event(evset), - Self::PROCESS_NOTIFY_BACKEND => raise_irq = self.notify_backend(evset).unwrap(), + Self::PROCESS_RXQ => self.handle_rxq_event(evset), + Self::PROCESS_TXQ => self.handle_txq_event(evset), + Self::PROCESS_EVQ => self.handle_evq_event(evset), + Self::PROCESS_NOTIFY_BACKEND => self.notify_backend(evset).unwrap(), _ => warn!("Unexpected vsock event received: {:?}", source), }; - if raise_irq { - self.signal_used_queue(source as usize) - .expect("vsock: Could not trigger device interrupt"); - } } else { warn!( "Vsock: The device is not yet activated. Spurious event received: {:?}", @@ -309,7 +305,9 @@ mod tests { let mut ctx = test_ctx.create_event_handler_context(); ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); - assert!(!ctx.device.handle_txq_event(EventSet::IN)); + let metric_before = METRICS.tx_queue_event_fails.count(); + ctx.device.handle_txq_event(EventSet::IN); + assert_eq!(metric_before + 1, METRICS.tx_queue_event_fails.count()); } } @@ -370,7 +368,9 @@ mod tests { let mut ctx = test_ctx.create_event_handler_context(); ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); ctx.device.backend.set_pending_rx(false); - assert!(!ctx.device.handle_rxq_event(EventSet::IN)); + let metric_before = METRICS.rx_queue_event_fails.count(); + ctx.device.handle_rxq_event(EventSet::IN); + assert_eq!(metric_before + 1, METRICS.rx_queue_event_fails.count()); } } @@ -381,7 +381,9 @@ mod tests { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); ctx.device.backend.set_pending_rx(false); - assert!(!ctx.device.handle_evq_event(EventSet::IN)); + let metric_before = METRICS.ev_queue_event_fails.count(); + ctx.device.handle_evq_event(EventSet::IN); + assert_eq!(metric_before + 1, METRICS.ev_queue_event_fails.count()); } } diff --git a/tests/integration_tests/functional/test_vsock.py b/tests/integration_tests/functional/test_vsock.py index dfa02510b37..5b6221c32a9 100644 --- a/tests/integration_tests/functional/test_vsock.py +++ b/tests/integration_tests/functional/test_vsock.py @@ -37,7 +37,7 @@ TEST_WORKER_COUNT = 10 -def test_vsock(uvm_plain_any, bin_vsock_path, test_fc_session_root_path): +def test_vsock(uvm_plain_any, pci_enabled, bin_vsock_path, test_fc_session_root_path): """ Test guest and host vsock initiated connections. 
@@ -45,7 +45,7 @@ def test_vsock(uvm_plain_any, bin_vsock_path, test_fc_session_root_path): """ vm = uvm_plain_any - vm.spawn() + vm.spawn(pci=pci_enabled) vm.basic_config() vm.add_net_iface() @@ -102,12 +102,12 @@ def negative_test_host_connections(vm, blob_path, blob_hash): validate_fc_metrics(metrics) -def test_vsock_epipe(uvm_plain, bin_vsock_path, test_fc_session_root_path): +def test_vsock_epipe(uvm_plain, pci_enabled, bin_vsock_path, test_fc_session_root_path): """ Vsock negative test to validate SIGPIPE/EPIPE handling. """ vm = uvm_plain - vm.spawn() + vm.spawn(pci=pci_enabled) vm.basic_config() vm.add_net_iface() vm.api.vsock.put(vsock_id="vsock0", guest_cid=3, uds_path=f"/{VSOCK_UDS_PATH}") @@ -129,7 +129,7 @@ def test_vsock_epipe(uvm_plain, bin_vsock_path, test_fc_session_root_path): def test_vsock_transport_reset_h2g( - uvm_nano, microvm_factory, bin_vsock_path, test_fc_session_root_path + uvm_plain, pci_enabled, microvm_factory, bin_vsock_path, test_fc_session_root_path ): """ Vsock transport reset test. @@ -146,7 +146,9 @@ def test_vsock_transport_reset_h2g( 6. Close VM -> Load VM from Snapshot -> check that vsock device is still working. """ - test_vm = uvm_nano + test_vm = uvm_plain + test_vm.spawn(pci=pci_enabled) + test_vm.basic_config(vcpu_count=2, mem_size_mib=256) test_vm.add_net_iface() test_vm.api.vsock.put(vsock_id="vsock0", guest_cid=3, uds_path=f"/{VSOCK_UDS_PATH}") test_vm.start() @@ -213,11 +215,13 @@ def test_vsock_transport_reset_h2g( validate_fc_metrics(metrics) -def test_vsock_transport_reset_g2h(uvm_nano, microvm_factory): +def test_vsock_transport_reset_g2h(uvm_plain, pci_enabled, microvm_factory): """ Vsock transport reset test. """ - test_vm = uvm_nano + test_vm = uvm_plain + test_vm.spawn(pci=pci_enabled) + test_vm.basic_config(vcpu_count=2, mem_size_mib=256) test_vm.add_net_iface() test_vm.api.vsock.put(vsock_id="vsock0", guest_cid=3, uds_path=f"/{VSOCK_UDS_PATH}") test_vm.start() From 129ffb6aa4d9c2b469d63dd9f0f14098afa3e2d1 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Tue, 8 Jul 2025 11:18:21 +0100 Subject: [PATCH 56/99] fix(pci): do not panic on invalid BDF during deserialization Correctly handle invalid Bdf by returning an error to the deserializer. This bug was caught by the fuzzer. Signed-off-by: Riccardo Mancini Signed-off-by: Babis Chalios --- src/pci/src/lib.rs | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/src/pci/src/lib.rs b/src/pci/src/lib.rs index f1dec5b126a..f2cf82f81fa 100644 --- a/src/pci/src/lib.rs +++ b/src/pci/src/lib.rs @@ -71,7 +71,7 @@ impl Visitor<'_> for PciBdfVisitor { where E: serde::de::Error, { - Ok(v.into()) + PciBdf::from_str(v).map_err(serde::de::Error::custom) } } @@ -176,24 +176,31 @@ impl Display for PciBdf { } } +/// Errors associated with parsing a BDF string. 
+#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum PciBdfParseError { + /// Unable to parse bus/device/function number hex: {0} + InvalidHex(#[from] ParseIntError), + /// Invalid format: {0} (expected format: 0000:00:00.0) + InvalidFormat(String), +} + impl FromStr for PciBdf { - type Err = ParseIntError; + type Err = PciBdfParseError; fn from_str(s: &str) -> Result { let items: Vec<&str> = s.split('.').collect(); - assert_eq!(items.len(), 2); + if items.len() != 2 { + return Err(PciBdfParseError::InvalidFormat(s.to_string())); + } let function = u8::from_str_radix(items[1], 16)?; let items: Vec<&str> = items[0].split(':').collect(); - assert_eq!(items.len(), 3); + if items.len() != 3 { + return Err(PciBdfParseError::InvalidFormat(s.to_string())); + } let segment = u16::from_str_radix(items[0], 16)?; let bus = u8::from_str_radix(items[1], 16)?; let device = u8::from_str_radix(items[2], 16)?; Ok(PciBdf::new(segment, bus, device, function)) } } - -impl From<&str> for PciBdf { - fn from(bdf: &str) -> Self { - Self::from_str(bdf).unwrap() - } -} From 99ed078282efbbd9a26b730bfd4a83232b707297 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Tue, 8 Jul 2025 12:15:22 +0100 Subject: [PATCH 57/99] test(pci): add unit tests for Bdf Add some unit tests to cover PciBdf parsing, conversion, and (de)serialization. Signed-off-by: Riccardo Mancini Signed-off-by: Babis Chalios --- Cargo.lock | 10 ++++ src/pci/Cargo.toml | 3 + src/pci/src/lib.rs | 145 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 158 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index ecdfe8fefca..9fcde886d54 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1057,6 +1057,7 @@ dependencies = [ "libc", "log", "serde", + "serde_test", "thiserror 2.0.12", "vm-allocator", "vm-device", @@ -1338,6 +1339,15 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_test" +version = "1.0.177" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f901ee573cab6b3060453d2d5f0bae4e6d628c23c0a962ff9b5f1d7c8d4f1ed" +dependencies = [ + "serde", +] + [[package]] name = "shlex" version = "1.3.0" diff --git a/src/pci/Cargo.toml b/src/pci/Cargo.toml index d179854f391..a7ef102acfb 100644 --- a/src/pci/Cargo.toml +++ b/src/pci/Cargo.toml @@ -24,3 +24,6 @@ vm-memory = { version = "0.16.1", features = [ "backend-mmap", "backend-bitmap", ] } + +[dev-dependencies] +serde_test = "1.0.177" diff --git a/src/pci/src/lib.rs b/src/pci/src/lib.rs index f2cf82f81fa..1b9a3a99f76 100644 --- a/src/pci/src/lib.rs +++ b/src/pci/src/lib.rs @@ -204,3 +204,148 @@ impl FromStr for PciBdf { Ok(PciBdf::new(segment, bus, device, function)) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_pci_bdf_new() { + let bdf = PciBdf::new(0x1234, 0x56, 0x1f, 0x7); + assert_eq!(bdf.segment(), 0x1234); + assert_eq!(bdf.bus(), 0x56); + assert_eq!(bdf.device(), 0x1f); + assert_eq!(bdf.function(), 0x7); + } + + #[test] + fn test_pci_bdf_from_u32() { + let bdf = PciBdf::from(0x12345678); + assert_eq!(bdf.segment(), 0x1234); + assert_eq!(bdf.bus(), 0x56); + assert_eq!(bdf.device(), 0x0f); + assert_eq!(bdf.function(), 0x0); + } + + #[test] + fn test_pci_bdf_to_u32() { + let bdf = PciBdf::new(0x1234, 0x56, 0x1f, 0x7); + let val: u32 = bdf.into(); + assert_eq!(val, 0x123456ff); + } + + #[test] + fn test_pci_bdf_to_u16() { + let bdf = PciBdf::new(0x1234, 0x56, 0x1f, 0x7); + let val: u16 = bdf.into(); + assert_eq!(val, 0x56ff); + } + + #[test] + fn test_pci_bdf_from_str_valid() { + let bdf = 
PciBdf::from_str("1234:56:1f.7").unwrap(); + assert_eq!(bdf.segment(), 0x1234); + assert_eq!(bdf.bus(), 0x56); + assert_eq!(bdf.device(), 0x1f); + assert_eq!(bdf.function(), 0x7); + } + + #[test] + fn test_pci_bdf_from_str_zero() { + let bdf = PciBdf::from_str("0000:00:00.0").unwrap(); + assert_eq!(bdf.segment(), 0); + assert_eq!(bdf.bus(), 0); + assert_eq!(bdf.device(), 0); + assert_eq!(bdf.function(), 0); + } + + #[test] + fn test_pci_bdf_from_str_invalid_format() { + assert!(matches!( + PciBdf::from_str("invalid"), + Err(PciBdfParseError::InvalidFormat(_)) + )); + assert!(matches!( + PciBdf::from_str("1234:56"), + Err(PciBdfParseError::InvalidFormat(_)) + )); + assert!(matches!( + PciBdf::from_str("1234:56:78:9a.b"), + Err(PciBdfParseError::InvalidFormat(_)) + )); + } + + #[test] + fn test_pci_bdf_from_str_invalid_hex() { + assert!(matches!( + PciBdf::from_str("xxxx:00:00.0"), + Err(PciBdfParseError::InvalidHex(_)) + )); + assert!(matches!( + PciBdf::from_str("0000:xx:00.0"), + Err(PciBdfParseError::InvalidHex(_)) + )); + assert!(matches!( + PciBdf::from_str("0000:00:xx.0"), + Err(PciBdfParseError::InvalidHex(_)) + )); + assert!(matches!( + PciBdf::from_str("0000:00:00.x"), + Err(PciBdfParseError::InvalidHex(_)) + )); + } + + #[test] + fn test_pci_bdf_display() { + let bdf = PciBdf::new(0x1234, 0x56, 0x1f, 0x7); + assert_eq!(format!("{}", bdf), "1234:56:1f.7"); + } + + #[test] + fn test_pci_bdf_debug() { + let bdf = PciBdf::new(0x1234, 0x56, 0x1f, 0x7); + assert_eq!(format!("{:?}", bdf), "1234:56:1f.7"); + } + + #[test] + fn test_pci_bdf_partial_eq() { + let bdf1 = PciBdf::new(0x1234, 0x56, 0x1f, 0x7); + let bdf2 = PciBdf::new(0x1234, 0x56, 0x1f, 0x7); + let bdf3 = PciBdf::new(0x1234, 0x56, 0x1f, 0x6); + assert_eq!(bdf1, bdf2); + assert_ne!(bdf1, bdf3); + } + + #[test] + fn test_pci_bdf_partial_ord() { + let bdf1 = PciBdf::new(0x1234, 0x56, 0x1f, 0x6); + let bdf2 = PciBdf::new(0x1234, 0x56, 0x1f, 0x7); + assert!(bdf1 < bdf2); + } + + #[test] + fn test_pci_bdf_deserialize_ok() { + // Test deserializer + let visitor = PciBdfVisitor; + let result = visitor + .visit_str::("1234:56:1f.7") + .unwrap(); + assert_eq!(result, PciBdf::new(0x1234, 0x56, 0x1f, 0x7)); + } + + #[test] + fn test_pci_bdf_deserialize_invalid() { + // Test deserializer with invalid input returns error + let visitor = PciBdfVisitor; + assert!(visitor + .visit_str::("invalid") + .is_err()); + } + + #[test] + fn test_pci_bdf_serialize() { + // Test serializer using serde_test + let bdf = PciBdf::new(0x1234, 0x56, 0x1f, 0x7); + serde_test::assert_tokens(&bdf, &[serde_test::Token::Str("1234:56:1f.7")]); + } +} From ffa9693b03a48f9f0021f34a6e3816023a6fe6a2 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Wed, 9 Jul 2025 15:00:24 +0100 Subject: [PATCH 58/99] fix(vmm): fix patch of pci devices The vmm was only checking the mmio device manager for finding the device to update. Use the generic device manager instead. Also update unit tests that expect a specific string. 
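
For illustration, after this change a block PATCH request is routed through the
generic manager roughly as follows (a trimmed-down sketch of the updated
Vmm::update_block_device_path; `drive_id` and `path_on_host` are whatever the
API handler received):

    self.device_manager
        .with_virtio_device_with_id(TYPE_BLOCK, drive_id, |block: &mut Block| {
            // The helper finds the device on either the MMIO or the PCI
            // transport, downcasts it to `Block` and hands it to the closure.
            block
                .update_disk_image(path_on_host)
                .map_err(|err| err.to_string())
        })
        .map_err(VmmError::FindDeviceError)
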
Signed-off-by: Riccardo Mancini Signed-off-by: Babis Chalios --- src/vmm/src/device_manager/mod.rs | 35 +++++++++++++++++++ src/vmm/src/lib.rs | 14 ++++---- .../integration_tests/functional/test_api.py | 4 +-- 3 files changed, 43 insertions(+), 10 deletions(-) diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index cfc7fe44d79..c7f6acabfe1 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -81,6 +81,17 @@ pub enum AttachDeviceError { PciTransport(#[from] PciManagerError), } +#[derive(Debug, thiserror::Error, displaydoc::Display)] +/// Error while searching for a VirtIO device +pub enum FindDeviceError { + /// Device type is invalid + InvalidDeviceType, + /// Device not found + DeviceNotFound, + /// Internal Device error: {0} + InternalDeviceError(String), +} + #[derive(Debug)] /// A manager of all peripheral devices of Firecracker pub struct DeviceManager { @@ -342,6 +353,30 @@ impl DeviceManager { ) } } + + /// Run fn `f()` for the virtio device matching `virtio_type` and `id`. + pub fn with_virtio_device_with_id( + &self, + virtio_type: u32, + id: &str, + f: F, + ) -> Result<(), FindDeviceError> + where + T: VirtioDevice + 'static + Debug, + F: FnOnce(&mut T) -> Result<(), String>, + { + if let Some(device) = self.get_virtio_device(virtio_type, id) { + let mut dev = device.lock().expect("Poisoned lock"); + f(dev + .as_mut_any() + .downcast_mut::() + .ok_or(FindDeviceError::InvalidDeviceType)?) + .map_err(FindDeviceError::InternalDeviceError)?; + } else { + return Err(FindDeviceError::DeviceNotFound); + } + Ok(()) + } } #[derive(Debug, Default, Clone, Serialize, Deserialize)] diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index e53439373c7..09a44e3c7ad 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -252,6 +252,8 @@ pub enum VmmError { VmmObserverTeardown(vmm_sys_util::errno::Error), /// VMGenID error: {0} VMGenID(#[from] VmGenIdError), + /// Failed perform action on device: {0} + FindDeviceError(#[from] device_manager::FindDeviceError), } /// Shorthand type for KVM dirty page bitmap. @@ -535,13 +537,12 @@ impl Vmm { path_on_host: String, ) -> Result<(), VmmError> { self.device_manager - .mmio_devices .with_virtio_device_with_id(TYPE_BLOCK, drive_id, |block: &mut Block| { block .update_disk_image(path_on_host) .map_err(|err| err.to_string()) }) - .map_err(VmmError::MmioDeviceManager) + .map_err(VmmError::FindDeviceError) } /// Updates the rate limiter parameters for block device with `drive_id` id. @@ -552,23 +553,21 @@ impl Vmm { rl_ops: BucketUpdate, ) -> Result<(), VmmError> { self.device_manager - .mmio_devices .with_virtio_device_with_id(TYPE_BLOCK, drive_id, |block: &mut Block| { block .update_rate_limiter(rl_bytes, rl_ops) .map_err(|err| err.to_string()) }) - .map_err(VmmError::MmioDeviceManager) + .map_err(VmmError::FindDeviceError) } /// Updates the rate limiter parameters for block device with `drive_id` id. pub fn update_vhost_user_block_config(&mut self, drive_id: &str) -> Result<(), VmmError> { self.device_manager - .mmio_devices .with_virtio_device_with_id(TYPE_BLOCK, drive_id, |block: &mut Block| { block.update_config().map_err(|err| err.to_string()) }) - .map_err(VmmError::MmioDeviceManager) + .map_err(VmmError::FindDeviceError) } /// Updates the rate limiter parameters for net device with `net_id` id. 
@@ -581,12 +580,11 @@ impl Vmm { tx_ops: BucketUpdate, ) -> Result<(), VmmError> { self.device_manager - .mmio_devices .with_virtio_device_with_id(TYPE_NET, net_id, |net: &mut Net| { net.patch_rate_limiters(rx_bytes, rx_ops, tx_bytes, tx_ops); Ok(()) }) - .map_err(VmmError::MmioDeviceManager) + .map_err(VmmError::FindDeviceError) } /// Returns a reference to the balloon device if present. diff --git a/tests/integration_tests/functional/test_api.py b/tests/integration_tests/functional/test_api.py index 1cc5d3c6c61..55bb15d5eb4 100644 --- a/tests/integration_tests/functional/test_api.py +++ b/tests/integration_tests/functional/test_api.py @@ -772,7 +772,7 @@ def test_send_ctrl_alt_del(uvm_plain_any): def _drive_patch(test_microvm, io_engine): """Exercise drive patch test scenarios.""" # Patches without mandatory fields for virtio block are not allowed. - expected_msg = "Unable to patch the block device: MMIO Device manager error: Running method expected different backend. Please verify the request arguments" + expected_msg = "Running method expected different backend." with pytest.raises(RuntimeError, match=expected_msg): test_microvm.api.drive.patch(drive_id="scratch") @@ -814,7 +814,7 @@ def _drive_patch(test_microvm, io_engine): ) # Updates to `path_on_host` with an invalid path are not allowed. - expected_msg = f"Unable to patch the block device: MMIO Device manager error: Virtio backend error: Error manipulating the backing file: No such file or directory (os error 2) {drive_path} Please verify the request arguments" + expected_msg = f"Error manipulating the backing file: No such file or directory (os error 2) {drive_path}" with pytest.raises(RuntimeError, match=re.escape(expected_msg)): test_microvm.api.drive.patch(drive_id="scratch", path_on_host=drive_path) From 394e3a4e0d4595ca9231863de6ed8a831d32fb0a Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Wed, 9 Jul 2025 16:18:58 +0100 Subject: [PATCH 59/99] fix(balloon): fix balloon not found when pci is enabled The code managing the balloon logic is only looking at the mmio device manager. Make it use the generic device manager to find the device. Signed-off-by: Riccardo Mancini Signed-off-by: Babis Chalios --- src/vmm/src/lib.rs | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 09a44e3c7ad..7bb33411b7e 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -123,7 +123,6 @@ use std::time::Duration; use device_manager::DeviceManager; use devices::acpi::vmgenid::VmGenIdError; -use devices::virtio::device::VirtioDevice; use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEventSubscriber}; use seccomp::BpfProgram; use snapshot::Persist; @@ -329,20 +328,6 @@ impl Vmm { self.shutdown_exit_code } - /// Gets the specified bus device. - pub fn get_virtio_device( - &self, - device_type: u32, - device_id: &str, - ) -> Option>> { - let device = self - .device_manager - .mmio_devices - .get_virtio_device(device_type, device_id)?; - - Some(device.inner.lock().expect("Poisoned lock").device().clone()) - } - /// Starts the microVM vcpus. /// /// # Errors @@ -589,7 +574,10 @@ impl Vmm { /// Returns a reference to the balloon device if present. 
pub fn balloon_config(&self) -> Result { - if let Some(virtio_device) = self.get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) { + if let Some(virtio_device) = self + .device_manager + .get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) + { let config = virtio_device .lock() .expect("Poisoned lock") @@ -606,7 +594,10 @@ impl Vmm { /// Returns the latest balloon statistics if they are enabled. pub fn latest_balloon_stats(&self) -> Result { - if let Some(virtio_device) = self.get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) { + if let Some(virtio_device) = self + .device_manager + .get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) + { let latest_stats = virtio_device .lock() .expect("Poisoned lock") @@ -631,7 +622,10 @@ impl Vmm { return Err(BalloonError::TooManyPagesRequested); } - if let Some(virtio_device) = self.get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) { + if let Some(virtio_device) = self + .device_manager + .get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) + { { virtio_device .lock() @@ -653,7 +647,10 @@ impl Vmm { &mut self, stats_polling_interval_s: u16, ) -> Result<(), BalloonError> { - if let Some(virtio_device) = self.get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) { + if let Some(virtio_device) = self + .device_manager + .get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) + { { virtio_device .lock() From 48455bfc5825cbad6e72f975ed822c5f4a44103a Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Thu, 10 Jul 2025 11:54:43 +0100 Subject: [PATCH 60/99] fix(restore): fix net device rename of PCI devices The device rename wasn't working on PCI devices because the code only checked the MMIO state. Fix the bug by looking for the device to rename in both the mmio and pci states. Signed-off-by: Riccardo Mancini Signed-off-by: Babis Chalios --- src/vmm/src/persist.rs | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 80b47f86076..b78d69fcdec 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -321,18 +321,23 @@ pub fn restore_from_snapshot( ) -> Result>, RestoreFromSnapshotError> { let mut microvm_state = snapshot_state_from_file(¶ms.snapshot_path)?; for entry in ¶ms.network_overrides { - let net_devices = &mut microvm_state.device_states.mmio_state.net_devices; - if let Some(device) = net_devices + microvm_state + .device_states + .mmio_state + .net_devices .iter_mut() - .find(|x| x.device_state.id == entry.iface_id) - { - device - .device_state - .tap_if_name - .clone_from(&entry.host_dev_name); - } else { - return Err(SnapshotStateFromFileError::UnknownNetworkDevice.into()); - } + .map(|device| &mut device.device_state) + .chain( + microvm_state + .device_states + .pci_state + .net_devices + .iter_mut() + .map(|device| &mut device.device_state), + ) + .find(|x| x.id == entry.iface_id) + .map(|device_state| device_state.tap_if_name.clone_from(&entry.host_dev_name)) + .ok_or(SnapshotStateFromFileError::UnknownNetworkDevice)?; } let track_dirty_pages = params.track_dirty_pages; From bab52cf44c42fa015986a29ebd9a9a926cd851db Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Tue, 8 Jul 2025 16:32:30 +0100 Subject: [PATCH 61/99] fix(msi): allocate GSI for MSI and "legacy IRQ" from different ranges Currently, we're limited to 24 GSI lines, which is too little for PCI devices. Keep the current ranges as "legacy GSI", and create a new range for "MSI GSI" that goes up to the kvm theoretical maximum of 4096 lines. 
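
With the layout constants introduced here this gives, on x86_64, legacy GSIs
5..=23 and MSI GSIs 24..=4095, and on aarch64, legacy GSIs 0..=95 (SPIs
32..=127) and MSI GSIs 96..=4095. A minimal sketch of the resulting allocator
API (names as added in this patch; the requested counts are arbitrary
examples):

    let mut allocator = ResourceAllocator::new();
    // A wired interrupt line for a device on the MMIO transport.
    let legacy_gsi = allocator.allocate_gsi_legacy(1).unwrap()[0];
    // A block of GSIs backing the MSI-X vectors of a PCI device.
    let msi_gsis = allocator.allocate_gsi_msi(4).unwrap();
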
Signed-off-by: Riccardo Mancini Signed-off-by: Babis Chalios --- src/vmm/src/arch/aarch64/gic/gicv2/mod.rs | 6 +- .../arch/aarch64/gic/gicv2/regs/dist_regs.rs | 6 +- src/vmm/src/arch/aarch64/gic/gicv3/mod.rs | 6 +- .../arch/aarch64/gic/gicv3/regs/dist_regs.rs | 6 +- src/vmm/src/arch/aarch64/layout.rs | 32 ++-- src/vmm/src/arch/mod.rs | 10 +- src/vmm/src/arch/x86_64/layout.rs | 26 +-- src/vmm/src/arch/x86_64/mptable.rs | 8 +- src/vmm/src/device_manager/acpi.rs | 2 +- src/vmm/src/device_manager/mmio.rs | 34 ++-- src/vmm/src/devices/acpi/vmgenid.rs | 2 +- src/vmm/src/vstate/resources.rs | 153 +++++++++++++----- src/vmm/src/vstate/vm.rs | 12 +- .../functional/test_max_devices.py | 61 ++++--- 14 files changed, 232 insertions(+), 132 deletions(-) diff --git a/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs index dfa2302d6be..01fd4b4d73d 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs @@ -135,9 +135,9 @@ impl GICv2 { // On arm there are 3 types of interrupts: SGI (0-15), PPI (16-31), SPI (32-1020). // SPIs are used to signal interrupts from various peripherals accessible across // the whole system so these are the ones that we increment when adding a new virtio device. - // KVM_DEV_ARM_VGIC_GRP_NR_IRQS sets the highest SPI number. Consequently, we will have a - // total of `super::layout::IRQ_MAX - 32` usable SPIs in our microVM. - let nr_irqs: u32 = super::layout::IRQ_MAX; + // KVM_DEV_ARM_VGIC_GRP_NR_IRQS sets the number of interrupts (SGI, PPI, and SPI). + // Consequently, we need to add 32 to the number of SPIs ("legacy GSI"). + let nr_irqs: u32 = crate::arch::GSI_LEGACY_NUM + super::layout::SPI_START; let nr_irqs_ptr = &nr_irqs as *const u32; Self::set_device_attribute( gic_device.device_fd(), diff --git a/src/vmm/src/arch/aarch64/gic/gicv2/regs/dist_regs.rs b/src/vmm/src/arch/aarch64/gic/gicv2/regs/dist_regs.rs index 21a404b302b..09a33a4a1ff 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv2/regs/dist_regs.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv2/regs/dist_regs.rs @@ -8,7 +8,7 @@ use kvm_ioctls::DeviceFd; use crate::arch::aarch64::gic::GicError; use crate::arch::aarch64::gic::regs::{GicRegState, MmioReg, SimpleReg, VgicRegEngine}; -use crate::arch::{IRQ_BASE, IRQ_MAX}; +use crate::arch::{GSI_LEGACY_NUM, SPI_START}; // Distributor registers as detailed at page 75 from // https://developer.arm.com/documentation/ihi0048/latest/. @@ -62,9 +62,9 @@ impl MmioReg for SharedIrqReg { // read-as-zero/write-ignore (RAZ/WI) policy. // The first part of a shared-irq register, the one corresponding to the // SGI and PPI IRQs (0-32) is RAZ/WI, so we skip it. - let start = self.offset + u64::from(IRQ_BASE) * u64::from(self.bits_per_irq) / 8; + let start = self.offset + u64::from(SPI_START) * u64::from(self.bits_per_irq) / 8; - let size_in_bits = u64::from(self.bits_per_irq) * u64::from(IRQ_MAX - IRQ_BASE); + let size_in_bits = u64::from(self.bits_per_irq) * u64::from(GSI_LEGACY_NUM); let mut size_in_bytes = size_in_bits / 8; if size_in_bits % 8 > 0 { size_in_bytes += 1; diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs index 075687bc23e..5d131cf7b76 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs @@ -184,9 +184,9 @@ impl GICv3 { // On arm there are 3 types of interrupts: SGI (0-15), PPI (16-31), SPI (32-1020). 
// SPIs are used to signal interrupts from various peripherals accessible across // the whole system so these are the ones that we increment when adding a new virtio device. - // KVM_DEV_ARM_VGIC_GRP_NR_IRQS sets the highest SPI number. Consequently, we will have a - // total of `super::layout::IRQ_MAX - 32` usable SPIs in our microVM. - let nr_irqs: u32 = super::layout::IRQ_MAX; + // KVM_DEV_ARM_VGIC_GRP_NR_IRQS sets the number of interrupts (SGI, PPI, and SPI). + // Consequently, we need to add 32 to the number of SPIs ("legacy GSI"). + let nr_irqs: u32 = crate::arch::GSI_LEGACY_NUM + super::layout::SPI_START; let nr_irqs_ptr = &nr_irqs as *const u32; Self::set_device_attribute( gic_device.device_fd(), diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/regs/dist_regs.rs b/src/vmm/src/arch/aarch64/gic/gicv3/regs/dist_regs.rs index 96c617dcc17..5a6eafb7003 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv3/regs/dist_regs.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv3/regs/dist_regs.rs @@ -8,7 +8,7 @@ use kvm_ioctls::DeviceFd; use crate::arch::aarch64::gic::GicError; use crate::arch::aarch64::gic::regs::{GicRegState, MmioReg, SimpleReg, VgicRegEngine}; -use crate::arch::{IRQ_BASE, IRQ_MAX}; +use crate::arch::{GSI_LEGACY_NUM, SPI_START}; // Distributor registers as detailed at page 456 from // https://static.docs.arm.com/ihi0069/c/IHI0069C_gic_architecture_specification.pdf. @@ -64,9 +64,9 @@ impl MmioReg for SharedIrqReg { // read-as-zero/write-ignore (RAZ/WI) policy. // The first part of a shared-irq register, the one corresponding to the // SGI and PPI IRQs (0-32) is RAZ/WI, so we skip it. - let start = self.offset + u64::from(IRQ_BASE) * u64::from(self.bits_per_irq) / 8; + let start = self.offset + u64::from(SPI_START) * u64::from(self.bits_per_irq) / 8; - let size_in_bits = u64::from(self.bits_per_irq) * u64::from(IRQ_MAX - IRQ_BASE); + let size_in_bits = u64::from(self.bits_per_irq) * u64::from(GSI_LEGACY_NUM); let mut size_in_bytes = size_in_bits / 8; if size_in_bits % 8 > 0 { size_in_bytes += 1; diff --git a/src/vmm/src/arch/aarch64/layout.rs b/src/vmm/src/arch/aarch64/layout.rs index 4b1f6ecda5b..c4937e43c92 100644 --- a/src/vmm/src/arch/aarch64/layout.rs +++ b/src/vmm/src/arch/aarch64/layout.rs @@ -76,19 +76,25 @@ pub const FDT_MAX_SIZE: usize = 0x20_0000; // * bigger than 32 // * less than 1023 and // * a multiple of 32. -/// The highest usable SPI on aarch64. -pub const IRQ_MAX: u32 = 128; - -/// First usable interrupt on aarch64. -pub const IRQ_BASE: u32 = 32; - -// The Linux kernel automatically shifts the GSI by 32 if it is an SPI, -// allowing us to start numbering from 0 instead of 32. -/// The first usable GSI on aarch64. -pub const GSI_BASE: u32 = 0; - -/// The maximum usable GSI on aarch64. -pub const GSI_MAX: u32 = IRQ_MAX - IRQ_BASE - 1; +// The first 32 SPIs are reserved, but KVM already shifts the gsi we +// pass, so we go from 0 to 95 for legacy gsis ("irq") and the remaining +// we use for MSI. +/// Offset of first SPI in the GIC +pub const SPI_START: u32 = 32; +/// Last possible SPI in the GIC (128 total SPIs) +pub const SPI_END: u32 = 127; +/// First usable GSI id on aarch64 (corresponds to SPI #32). 
+pub const GSI_LEGACY_START: u32 = 0; +/// There are 128 SPIs available, but the first 32 are reserved +pub const GSI_LEGACY_NUM: u32 = SPI_END - SPI_START + 1; +/// Last available GSI +pub const GSI_LEGACY_END: u32 = GSI_LEGACY_START + GSI_LEGACY_NUM - 1; +/// First GSI used by MSI after legacy GSI +pub const GSI_MSI_START: u32 = GSI_LEGACY_END + 1; +/// The highest available GSI in KVM (KVM_MAX_IRQ_ROUTES=4096) +pub const GSI_MSI_END: u32 = 4095; +/// Number of GSI available for MSI. +pub const GSI_MSI_NUM: u32 = GSI_MSI_END - GSI_MSI_START + 1; /// The start of the memory area reserved for MMIO 32-bit accesses. /// Below this address will reside the GIC, above this address will reside the MMIO devices. diff --git a/src/vmm/src/arch/mod.rs b/src/vmm/src/arch/mod.rs index fbeb9fa0ce0..6d33ce461b9 100644 --- a/src/vmm/src/arch/mod.rs +++ b/src/vmm/src/arch/mod.rs @@ -21,13 +21,14 @@ pub use aarch64::vm::{ArchVm, ArchVmError, VmState}; #[cfg(target_arch = "aarch64")] pub use aarch64::{ ConfigurationError, arch_memory_regions, configure_system_for_boot, get_kernel_start, - initrd_load_addr, layout::BOOT_DEVICE_MEM_START, layout::CMDLINE_MAX_SIZE, layout::GSI_BASE, - layout::GSI_MAX, layout::IRQ_BASE, layout::IRQ_MAX, layout::MEM_32BIT_DEVICES_SIZE, + initrd_load_addr, layout::BOOT_DEVICE_MEM_START, layout::CMDLINE_MAX_SIZE, + layout::GSI_LEGACY_END, layout::GSI_LEGACY_NUM, layout::GSI_LEGACY_START, layout::GSI_MSI_END, + layout::GSI_MSI_NUM, layout::GSI_MSI_START, layout::MEM_32BIT_DEVICES_SIZE, layout::MEM_32BIT_DEVICES_START, layout::MEM_64BIT_DEVICES_SIZE, layout::MEM_64BIT_DEVICES_START, layout::MMIO32_MEM_SIZE, layout::MMIO32_MEM_START, layout::PCI_MMCONFIG_SIZE, layout::PCI_MMCONFIG_START, layout::PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, layout::RTC_MEM_START, layout::SERIAL_MEM_START, - layout::SYSTEM_MEM_SIZE, layout::SYSTEM_MEM_START, load_kernel, + layout::SPI_START, layout::SYSTEM_MEM_SIZE, layout::SYSTEM_MEM_START, load_kernel, }; /// Module for x86_64 related functionality. @@ -45,7 +46,8 @@ pub use x86_64::vm::{ArchVm, ArchVmError, VmState}; pub use crate::arch::x86_64::{ ConfigurationError, arch_memory_regions, configure_system_for_boot, get_kernel_start, initrd_load_addr, layout::APIC_ADDR, layout::BOOT_DEVICE_MEM_START, layout::CMDLINE_MAX_SIZE, - layout::GSI_BASE, layout::GSI_MAX, layout::IOAPIC_ADDR, layout::IRQ_BASE, layout::IRQ_MAX, + layout::GSI_LEGACY_END, layout::GSI_LEGACY_NUM, layout::GSI_LEGACY_START, layout::GSI_MSI_END, + layout::GSI_MSI_NUM, layout::GSI_MSI_START, layout::IOAPIC_ADDR, layout::MEM_32BIT_DEVICES_SIZE, layout::MEM_32BIT_DEVICES_START, layout::MEM_64BIT_DEVICES_SIZE, layout::MEM_64BIT_DEVICES_START, layout::MMIO32_MEM_SIZE, layout::MMIO32_MEM_START, layout::PCI_MMCONFIG_SIZE, layout::PCI_MMCONFIG_START, diff --git a/src/vmm/src/arch/x86_64/layout.rs b/src/vmm/src/arch/x86_64/layout.rs index b7d5eb6dc5f..34ad343af2a 100644 --- a/src/vmm/src/arch/x86_64/layout.rs +++ b/src/vmm/src/arch/x86_64/layout.rs @@ -21,17 +21,21 @@ pub const CMDLINE_MAX_SIZE: usize = 2048; /// Start of the high memory. pub const HIMEM_START: u64 = 0x0010_0000; // 1 MB. -// Typically, on x86 systems 24 IRQs are used (0-23). -/// First usable IRQ ID for virtio device interrupts on x86_64. -pub const IRQ_BASE: u32 = 5; -/// Last usable IRQ ID for virtio device interrupts on x86_64. -pub const IRQ_MAX: u32 = 23; - -/// The first usable GSI on x86_64 is the same as the first usable IRQ ID. 
-pub const GSI_BASE: u32 = IRQ_BASE; - -/// The maximum usable GSI on x86_64 is the same as the last usable IRQ ID. -pub const GSI_MAX: u32 = IRQ_MAX; +// Typically, on x86 systems 24 IRQs are used for legacy devices (0-23). +// However, the first 5 are reserved. +// We allocate the remaining GSIs to MSIs. +/// First usable GSI for legacy interrupts (IRQ) on x86_64. +pub const GSI_LEGACY_START: u32 = 5; +/// Last usable GSI for legacy interrupts (IRQ) on x86_64. +pub const GSI_LEGACY_END: u32 = 23; +/// Number of legacy GSI (IRQ) available on x86_64. +pub const GSI_LEGACY_NUM: u32 = GSI_LEGACY_END - GSI_LEGACY_START + 1; +/// First GSI used by MSI after legacy GSI. +pub const GSI_MSI_START: u32 = GSI_LEGACY_END + 1; +/// The highest available GSI in KVM (KVM_MAX_IRQ_ROUTES=4096). +pub const GSI_MSI_END: u32 = 4095; +/// Number of GSI available for MSI. +pub const GSI_MSI_NUM: u32 = GSI_MSI_END - GSI_MSI_START + 1; /// Address for the TSS setup. pub const KVM_TSS_ADDRESS: u64 = 0xfffb_d000; diff --git a/src/vmm/src/arch/x86_64/mptable.rs b/src/vmm/src/arch/x86_64/mptable.rs index a4b1e2fa632..99fb202c8d8 100644 --- a/src/vmm/src/arch/x86_64/mptable.rs +++ b/src/vmm/src/arch/x86_64/mptable.rs @@ -13,7 +13,7 @@ use libc::c_char; use log::debug; use vm_allocator::AllocPolicy; -use crate::arch::IRQ_MAX; +use crate::arch::GSI_LEGACY_END; use crate::arch::x86_64::generated::mpspec; use crate::vstate::memory::{ Address, ByteValued, Bytes, GuestAddress, GuestMemory, GuestMemoryMmap, @@ -109,7 +109,7 @@ fn compute_mp_size(num_cpus: u8) -> usize { + mem::size_of::() * (num_cpus as usize) + mem::size_of::() + mem::size_of::() - + mem::size_of::() * (IRQ_MAX as usize + 1) + + mem::size_of::() * (GSI_LEGACY_END as usize + 1) + mem::size_of::() * 2 } @@ -225,7 +225,7 @@ pub fn setup_mptable( mp_num_entries += 1; } // Per kvm_setup_default_irq_routing() in kernel - for i in 0..=u8::try_from(IRQ_MAX).map_err(|_| MptableError::TooManyIrqs)? { + for i in 0..=u8::try_from(GSI_LEGACY_END).map_err(|_| MptableError::TooManyIrqs)? { let size = mem::size_of::() as u64; let mpc_intsrc = mpspec::mpc_intsrc { type_: mpspec::MP_INTSRC.try_into().unwrap(), @@ -406,7 +406,7 @@ mod tests { // ISA Bus + 1 // IRQ - + u16::try_from(IRQ_MAX).unwrap() + 1 + + u16::try_from(GSI_LEGACY_END).unwrap() + 1 // Interrupt source ExtINT + 1 // Interrupt source NMI diff --git a/src/vmm/src/device_manager/acpi.rs b/src/vmm/src/device_manager/acpi.rs index 3f0af80c7aa..874443fcc5c 100644 --- a/src/vmm/src/device_manager/acpi.rs +++ b/src/vmm/src/device_manager/acpi.rs @@ -64,7 +64,7 @@ impl Aml for ACPIDeviceManager { // We know that the maximum IRQ number fits in a u8. We have up to // 32 IRQs in x86 and up to 128 in // ARM (look into - // `vmm::crate::arch::layout::IRQ_MAX`) + // `vmm::crate::arch::layout::GSI_LEGACY_END`) #[allow(clippy::cast_possible_truncation)] &aml::Equal::new(&aml::Arg(0), &(vmgenid.gsi as u8)), vec![&aml::Notify::new( diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index d0c116ce20d..0cbc35535c9 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -82,12 +82,12 @@ fn add_virtio_aml( dsdt_data: &mut Vec, addr: u64, len: u64, - irq: u32, + gsi: u32, ) -> Result<(), aml::AmlError> { - let dev_id = irq - crate::arch::GSI_BASE; + let dev_id = gsi - crate::arch::GSI_LEGACY_START; debug!( - "acpi: Building AML for VirtIO device _SB_.V{:03}. 
memory range: {:#010x}:{} irq: {}", - dev_id, addr, len, irq + "acpi: Building AML for VirtIO device _SB_.V{:03}. memory range: {:#010x}:{} gsi: {}", + dev_id, addr, len, gsi ); aml::Device::new( format!("V{:03}", dev_id).as_str().try_into()?, @@ -103,7 +103,7 @@ fn add_virtio_aml( addr.try_into().unwrap(), len.try_into().unwrap(), ), - &aml::Interrupt::new(true, true, false, false, irq), + &aml::Interrupt::new(true, true, false, false, gsi), ]), )?, ], @@ -156,9 +156,9 @@ impl MMIODeviceManager { resource_allocator: &mut ResourceAllocator, irq_count: u32, ) -> Result { - let irq = match resource_allocator.allocate_gsi(irq_count)?[..] { + let gsi = match resource_allocator.allocate_gsi_legacy(irq_count)?[..] { [] => None, - [irq] => Some(irq), + [gsi] => Some(gsi), _ => return Err(MmioError::InvalidIrqConfig), }; @@ -169,7 +169,7 @@ impl MMIODeviceManager { AllocPolicy::FirstMatch, )?, len: MMIO_LEN, - irq, + irq: gsi, }; Ok(device_info) } @@ -183,7 +183,7 @@ impl MMIODeviceManager { ) -> Result<(), MmioError> { // Our virtio devices are currently hardcoded to use a single IRQ. // Validate that requirement. - let irq = device.resources.irq.ok_or(MmioError::InvalidIrqConfig)?; + let gsi = device.resources.irq.ok_or(MmioError::InvalidIrqConfig)?; let identifier; { let mmio_device = device.inner.lock().expect("Poisoned lock"); @@ -197,7 +197,7 @@ impl MMIODeviceManager { .register_ioevent(queue_evt, &io_addr, u32::try_from(i).unwrap()) .map_err(MmioError::RegisterIoEvent)?; } - vm.register_irq(&mmio_device.interrupt.irq_evt, irq) + vm.register_irq(&mmio_device.interrupt.irq_evt, gsi) .map_err(MmioError::RegisterIrqFd)?; } @@ -276,7 +276,7 @@ impl MMIODeviceManager { let device_info = if let Some(device_info) = device_info_opt { device_info } else { - let gsi = vm.resource_allocator().allocate_gsi(1)?; + let gsi = vm.resource_allocator().allocate_gsi_legacy(1)?; MMIODeviceInfo { addr: SERIAL_MEM_START, len: MMIO_LEN, @@ -335,7 +335,7 @@ impl MMIODeviceManager { let device_info = if let Some(device_info) = device_info_opt { device_info } else { - let gsi = vm.resource_allocator().allocate_gsi(1)?; + let gsi = vm.resource_allocator().allocate_gsi_legacy(1)?; MMIODeviceInfo { addr: RTC_MEM_START, len: MMIO_LEN, @@ -612,7 +612,7 @@ pub(crate) mod tests { let dev = device_manager.get_virtio_device(0, "dummy").unwrap(); assert_eq!(dev.resources.addr, arch::MEM_32BIT_DEVICES_START); assert_eq!(dev.resources.len, MMIO_LEN); - assert_eq!(dev.resources.irq, Some(arch::GSI_BASE)); + assert_eq!(dev.resources.irq, Some(arch::GSI_LEGACY_START)); device_manager .for_each_virtio_device(|virtio_type, device_id, mmio_device| { @@ -620,7 +620,7 @@ pub(crate) mod tests { assert_eq!(device_id, "dummy"); assert_eq!(mmio_device.resources.addr, arch::MEM_32BIT_DEVICES_START); assert_eq!(mmio_device.resources.len, MMIO_LEN); - assert_eq!(mmio_device.resources.irq, Some(arch::GSI_BASE)); + assert_eq!(mmio_device.resources.irq, Some(arch::GSI_LEGACY_START)); Ok::<(), ()>(()) }) .unwrap(); @@ -643,7 +643,7 @@ pub(crate) mod tests { #[cfg(target_arch = "aarch64")] vm.setup_irqchip(1).unwrap(); - for _i in crate::arch::GSI_BASE..=crate::arch::GSI_MAX { + for _i in crate::arch::GSI_LEGACY_START..=crate::arch::GSI_LEGACY_END { device_manager .register_virtio_test_device( &vm, @@ -711,7 +711,7 @@ pub(crate) mod tests { .addr ); assert_eq!( - crate::arch::GSI_BASE, + crate::arch::GSI_LEGACY_START, device_manager.virtio_devices[&(type_id, id)] .resources .irq @@ -762,7 +762,7 @@ pub(crate) mod tests { let device_info = 
device_manager .allocate_mmio_resources(&mut resource_allocator, 1) .unwrap(); - assert_eq!(device_info.irq.unwrap(), crate::arch::GSI_BASE); + assert_eq!(device_info.irq.unwrap(), crate::arch::GSI_LEGACY_START); } #[test] diff --git a/src/vmm/src/devices/acpi/vmgenid.rs b/src/vmm/src/devices/acpi/vmgenid.rs index 6d096007193..8dc89289c98 100644 --- a/src/vmm/src/devices/acpi/vmgenid.rs +++ b/src/vmm/src/devices/acpi/vmgenid.rs @@ -88,7 +88,7 @@ impl VmGenId { mem: &GuestMemoryMmap, resource_allocator: &mut ResourceAllocator, ) -> Result { - let gsi = resource_allocator.allocate_gsi(1)?; + let gsi = resource_allocator.allocate_gsi_legacy(1)?; // The generation ID needs to live in an 8-byte aligned buffer let addr = resource_allocator.allocate_system_memory( VMGENID_MEM_SIZE, diff --git a/src/vmm/src/vstate/resources.rs b/src/vmm/src/vstate/resources.rs index 6571abf8cfc..545b211699f 100644 --- a/src/vmm/src/vstate/resources.rs +++ b/src/vmm/src/vstate/resources.rs @@ -10,6 +10,29 @@ use vm_allocator::{AddressAllocator, IdAllocator}; use crate::arch; use crate::snapshot::Persist; +/// Helper function to allocate many ids from an id allocator +fn allocate_many_ids( + id_allocator: &mut IdAllocator, + count: u32, +) -> Result, vm_allocator::Error> { + let mut ids = Vec::with_capacity(count as usize); + + for _ in 0..count { + match id_allocator.allocate_id() { + Ok(id) => ids.push(id), + Err(err) => { + // It is ok to unwrap here, we just allocated the GSI + ids.into_iter().for_each(|id| { + id_allocator.free_id(id).unwrap(); + }); + return Err(err); + } + } + } + + Ok(ids) +} + /// A resource manager for (de)allocating interrupt lines (GSIs) and guest memory /// /// At the moment, we support: @@ -19,8 +42,10 @@ use crate::snapshot::Persist; /// * Memory allocations in the MMIO address space #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ResourceAllocator { - /// Allocator for device interrupt lines - pub gsi_allocator: IdAllocator, + /// Allocator for legacy device interrupt lines + pub gsi_legacy_allocator: IdAllocator, + /// Allocator for PCI device GSIs + pub gsi_msi_allocator: IdAllocator, /// Allocator for memory in the 32-bit MMIO address space pub mmio32_memory: AddressAllocator, /// Allocator for memory in the 64-bit MMIO address space @@ -41,7 +66,9 @@ impl ResourceAllocator { // It is fine for us to unwrap the following since we know we are passing valid ranges for // all allocators Self { - gsi_allocator: IdAllocator::new(arch::GSI_BASE, arch::GSI_MAX).unwrap(), + gsi_legacy_allocator: IdAllocator::new(arch::GSI_LEGACY_START, arch::GSI_LEGACY_END) + .unwrap(), + gsi_msi_allocator: IdAllocator::new(arch::GSI_MSI_START, arch::GSI_MSI_END).unwrap(), mmio32_memory: AddressAllocator::new( arch::MEM_32BIT_DEVICES_START, arch::MEM_32BIT_DEVICES_SIZE, @@ -57,28 +84,22 @@ impl ResourceAllocator { } } - /// Allocate a number of GSIs + /// Allocate a number of legacy GSIs /// /// # Arguments /// - /// * `gsi_count` - The number of GSIs to allocate - pub fn allocate_gsi(&mut self, gsi_count: u32) -> Result, vm_allocator::Error> { - let mut gsis = Vec::with_capacity(gsi_count as usize); - - for _ in 0..gsi_count { - match self.gsi_allocator.allocate_id() { - Ok(gsi) => gsis.push(gsi), - Err(err) => { - // It is ok to unwrap here, we just allocated the GSI - gsis.into_iter().for_each(|gsi| { - self.gsi_allocator.free_id(gsi).unwrap(); - }); - return Err(err); - } - } - } + /// * `gsi_count` - The number of legacy GSIs to allocate + pub fn allocate_gsi_legacy(&mut self, gsi_count: 
u32) -> Result, vm_allocator::Error> { + allocate_many_ids(&mut self.gsi_legacy_allocator, gsi_count) + } - Ok(gsis) + /// Allocate a number of GSIs for MSI + /// + /// # Arguments + /// + /// * `gsi_count` - The number of GSIs to allocate + pub fn allocate_gsi_msi(&mut self, gsi_count: u32) -> Result, vm_allocator::Error> { + allocate_many_ids(&mut self.gsi_msi_allocator, gsi_count) } /// Allocate a memory range in 32-bit MMIO address space @@ -167,47 +188,93 @@ mod tests { use vm_allocator::AllocPolicy; use super::ResourceAllocator; - use crate::arch::{self, GSI_BASE}; + use crate::arch::{self, GSI_LEGACY_NUM, GSI_LEGACY_START, GSI_MSI_NUM, GSI_MSI_START}; use crate::snapshot::{Persist, Snapshot}; - const MAX_IRQS: u32 = arch::GSI_MAX - arch::GSI_BASE + 1; + #[test] + fn test_allocate_irq() { + let mut allocator = ResourceAllocator::new(); + // asking for 0 IRQs should return us an empty vector + assert_eq!(allocator.allocate_gsi_legacy(0), Ok(vec![])); + // We cannot allocate more GSIs than available + assert_eq!( + allocator.allocate_gsi_legacy(GSI_LEGACY_NUM + 1), + Err(vm_allocator::Error::ResourceNotAvailable) + ); + // But allocating all of them at once should work + assert_eq!( + allocator.allocate_gsi_legacy(GSI_LEGACY_NUM), + Ok((arch::GSI_LEGACY_START..=arch::GSI_LEGACY_END).collect::>()) + ); + // And now we ran out of GSIs + assert_eq!( + allocator.allocate_gsi_legacy(1), + Err(vm_allocator::Error::ResourceNotAvailable) + ); + // But we should be able to ask for 0 GSIs + assert_eq!(allocator.allocate_gsi_legacy(0), Ok(vec![])); + + let mut allocator = ResourceAllocator::new(); + // We should be able to allocate 1 GSI + assert_eq!( + allocator.allocate_gsi_legacy(1), + Ok(vec![arch::GSI_LEGACY_START]) + ); + // We can't allocate MAX_IRQS any more + assert_eq!( + allocator.allocate_gsi_legacy(GSI_LEGACY_NUM), + Err(vm_allocator::Error::ResourceNotAvailable) + ); + // We can allocate another one and it should be the second available + assert_eq!( + allocator.allocate_gsi_legacy(1), + Ok(vec![arch::GSI_LEGACY_START + 1]) + ); + // Let's allocate the rest in a loop + for i in arch::GSI_LEGACY_START + 2..=arch::GSI_LEGACY_END { + assert_eq!(allocator.allocate_gsi_legacy(1), Ok(vec![i])); + } + } #[test] fn test_allocate_gsi() { let mut allocator = ResourceAllocator::new(); // asking for 0 IRQs should return us an empty vector - assert_eq!(allocator.allocate_gsi(0), Ok(vec![])); + assert_eq!(allocator.allocate_gsi_msi(0), Ok(vec![])); // We cannot allocate more GSIs than available assert_eq!( - allocator.allocate_gsi(MAX_IRQS + 1), + allocator.allocate_gsi_msi(GSI_MSI_NUM + 1), Err(vm_allocator::Error::ResourceNotAvailable) ); // But allocating all of them at once should work assert_eq!( - allocator.allocate_gsi(MAX_IRQS), - Ok((arch::GSI_BASE..=arch::GSI_MAX).collect::>()) + allocator.allocate_gsi_msi(GSI_MSI_NUM), + Ok((arch::GSI_MSI_START..=arch::GSI_MSI_END).collect::>()) ); // And now we ran out of GSIs assert_eq!( - allocator.allocate_gsi(1), + allocator.allocate_gsi_msi(1), Err(vm_allocator::Error::ResourceNotAvailable) ); // But we should be able to ask for 0 GSIs - assert_eq!(allocator.allocate_gsi(0), Ok(vec![])); + assert_eq!(allocator.allocate_gsi_msi(0), Ok(vec![])); let mut allocator = ResourceAllocator::new(); // We should be able to allocate 1 GSI - assert_eq!(allocator.allocate_gsi(1), Ok(vec![arch::GSI_BASE])); + assert_eq!(allocator.allocate_gsi_msi(1), Ok(vec![arch::GSI_MSI_START])); // We can't allocate MAX_IRQS any more assert_eq!( - 
allocator.allocate_gsi(MAX_IRQS), + allocator.allocate_gsi_msi(GSI_MSI_NUM), Err(vm_allocator::Error::ResourceNotAvailable) ); // We can allocate another one and it should be the second available - assert_eq!(allocator.allocate_gsi(1), Ok(vec![arch::GSI_BASE + 1])); + assert_eq!( + allocator.allocate_gsi_msi(1), + Ok(vec![arch::GSI_MSI_START + 1]) + ); // Let's allocate the rest in a loop - for i in arch::GSI_BASE + 2..=arch::GSI_MAX { - assert_eq!(allocator.allocate_gsi(1), Ok(vec![i])); + for i in arch::GSI_MSI_START + 2..=arch::GSI_MSI_END { + assert_eq!(allocator.allocate_gsi_msi(1), Ok(vec![i])); } } @@ -221,12 +288,16 @@ mod tests { #[test] fn test_save_restore() { let mut allocator0 = ResourceAllocator::new(); - let gsi_0 = allocator0.allocate_gsi(1).unwrap()[0]; - assert_eq!(gsi_0, GSI_BASE); + let irq_0 = allocator0.allocate_gsi_legacy(1).unwrap()[0]; + assert_eq!(irq_0, GSI_LEGACY_START); + let gsi_0 = allocator0.allocate_gsi_msi(1).unwrap()[0]; + assert_eq!(gsi_0, GSI_MSI_START); let mut allocator1 = clone_allocator(&allocator0); - let gsi_1 = allocator1.allocate_gsi(1).unwrap()[0]; - assert_eq!(gsi_1, GSI_BASE + 1); + let irq_1 = allocator1.allocate_gsi_legacy(1).unwrap()[0]; + assert_eq!(irq_1, GSI_LEGACY_START + 1); + let gsi_1 = allocator1.allocate_gsi_msi(1).unwrap()[0]; + assert_eq!(gsi_1, GSI_MSI_START + 1); let mmio32_mem = allocator1 .allocate_32bit_mmio_memory(0x42, 1, AllocPolicy::FirstMatch) .unwrap(); @@ -251,8 +322,10 @@ mod tests { .allocate_system_memory(0x42, 1, AllocPolicy::ExactMatch(system_mem)) .unwrap_err(); - let gsi_2 = allocator2.allocate_gsi(1).unwrap()[0]; - assert_eq!(gsi_2, GSI_BASE + 2); + let irq_2 = allocator2.allocate_gsi_legacy(1).unwrap()[0]; + assert_eq!(irq_2, GSI_LEGACY_START + 2); + let gsi_2 = allocator2.allocate_gsi_msi(1).unwrap()[0]; + assert_eq!(gsi_2, GSI_MSI_START + 2); let mmio32_mem = allocator1 .allocate_32bit_mmio_memory(0x42, 1, AllocPolicy::FirstMatch) .unwrap(); diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index e60a1b0784a..f4a18484cbc 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -596,7 +596,7 @@ impl Vm { let mut irq_routes = HashMap::with_capacity(count as usize); for (gsi, i) in vm .resource_allocator() - .allocate_gsi(count as u32)? + .allocate_gsi_msi(count as u32)? .iter() .zip(0u32..) 
{ @@ -926,7 +926,7 @@ pub(crate) mod tests { } for i in 0..4 { - let gsi = crate::arch::GSI_BASE + i; + let gsi = crate::arch::GSI_MSI_START + i; let interrupts = vm.common.interrupts.lock().unwrap(); let kvm_route = interrupts.get(&gsi).unwrap(); assert!(kvm_route.masked); @@ -943,7 +943,7 @@ pub(crate) mod tests { // Simply enabling the vectors should not update the registered IRQ routes msix_group.enable().unwrap(); for i in 0..4 { - let gsi = crate::arch::GSI_BASE + i; + let gsi = crate::arch::GSI_MSI_START + i; let interrupts = vm.common.interrupts.lock().unwrap(); let kvm_route = interrupts.get(&gsi).unwrap(); assert!(kvm_route.masked); @@ -963,7 +963,7 @@ pub(crate) mod tests { .update(0, InterruptSourceConfig::MsiIrq(config), false, true) .unwrap(); for i in 0..4 { - let gsi = crate::arch::GSI_BASE + i; + let gsi = crate::arch::GSI_MSI_START + i; let interrupts = vm.common.interrupts.lock().unwrap(); let kvm_route = interrupts.get(&gsi).unwrap(); assert_eq!(kvm_route.masked, i != 0); @@ -1038,7 +1038,7 @@ pub(crate) mod tests { let (gsi, range) = { let mut resource_allocator = vm.resource_allocator(); - let gsi = resource_allocator.allocate_gsi(1).unwrap()[0]; + let gsi = resource_allocator.allocate_gsi_msi(1).unwrap()[0]; let range = resource_allocator .allocate_32bit_mmio_memory(1024, 1024, AllocPolicy::FirstMatch) .unwrap(); @@ -1052,7 +1052,7 @@ pub(crate) mod tests { vm.restore_state(&restored_state).unwrap(); let mut resource_allocator = vm.resource_allocator(); - let gsi_new = resource_allocator.allocate_gsi(1).unwrap()[0]; + let gsi_new = resource_allocator.allocate_gsi_msi(1).unwrap()[0]; assert_eq!(gsi + 1, gsi_new); resource_allocator diff --git a/tests/integration_tests/functional/test_max_devices.py b/tests/integration_tests/functional/test_max_devices.py index 85cf2f1399c..3c52127792d 100644 --- a/tests/integration_tests/functional/test_max_devices.py +++ b/tests/integration_tests/functional/test_max_devices.py @@ -6,63 +6,78 @@ import pytest -# On x86_64, IRQs are available from 5 to 23. We always use one IRQ for VMGenID -# device, so the maximum number of devices supported at the same time is 18. -# On aarch64, IRQs are available from 32 to 127. We always use one IRQ each for -# the VMGenID and RTC devices, so the maximum number of devices supported -# at the same time is 94. -MAX_DEVICES_ATTACHED = {"x86_64": 18, "aarch64": 94}.get(platform.machine()) - - -def test_attach_maximum_devices(microvm_factory, guest_kernel, rootfs): +def max_devices(uvm): + """ + Returns the maximum number of devices supported by the platform. + """ + if uvm.pci_enabled: + # On PCI, we only have one bus, so 32 minus the bus itself + return 31 + + match platform.machine(): + case "aarch64": + # On aarch64, IRQs are available from 32 to 127. We always use one IRQ each for + # the VMGenID and RTC devices, so the maximum number of devices supported + # at the same time is 94. + return 94 + case "x86_64": + # IRQs are available from 5 to 23. We always use one IRQ for VMGenID device, so + # the maximum number of devices supported at the same time is 18. + return 18 + case _: + raise ValueError("Unknown platform") + + +def test_attach_maximum_devices(microvm_factory, guest_kernel, rootfs, pci_enabled): """ Test attaching maximum number of devices to the microVM. 
""" - if MAX_DEVICES_ATTACHED is None: - pytest.skip("Unsupported platform for this test.") - test_microvm = microvm_factory.build(guest_kernel, rootfs, monitor_memory=False) - test_microvm.spawn() + test_microvm.spawn(pci=pci_enabled) # The default 256mib is not enough for 94 ssh connections on aarch64. test_microvm.basic_config(mem_size_mib=512) + max_devices_attached = max_devices(test_microvm) # Add (`MAX_DEVICES_ATTACHED` - 1) devices because the rootfs # has already been configured in the `basic_config()`function. - for _ in range(MAX_DEVICES_ATTACHED - 1): + for _ in range(max_devices_attached - 1): test_microvm.add_net_iface() test_microvm.start() # Test that network devices attached are operational. - for i in range(MAX_DEVICES_ATTACHED - 1): + for i in range(max_devices_attached - 1): # Verify if guest can run commands. test_microvm.ssh_iface(i).check_output("sync") -def test_attach_too_many_devices(microvm_factory, guest_kernel, rootfs): +def test_attach_too_many_devices(microvm_factory, guest_kernel, rootfs, pci_enabled): """ Test attaching to a microVM more devices than available IRQs. """ - if MAX_DEVICES_ATTACHED is None: - pytest.skip("Unsupported platform for this test.") - test_microvm = microvm_factory.build(guest_kernel, rootfs, monitor_memory=False) - test_microvm.spawn() + test_microvm.spawn(pci=pci_enabled) # Set up a basic microVM. test_microvm.basic_config() + max_devices_attached = max_devices(test_microvm) + # Add `MAX_DEVICES_ATTACHED` network devices on top of the # already configured rootfs. - for _ in range(MAX_DEVICES_ATTACHED): + for _ in range(max_devices_attached): test_microvm.add_net_iface() # Attempting to start a microVM with more than # `MAX_DEVICES_ATTACHED` devices should fail. error_str = ( - "Failed to allocate requested resource: The requested resource" - " is not available." + ("Could not find an available device slot on the PCI bus.") + if pci_enabled + else ( + "Failed to allocate requested resource: The requested resource" + " is not available." + ) ) with pytest.raises(RuntimeError, match=error_str): test_microvm.start() From b4ea6cd2cad83e96d4f0660b854ca70d3d308277 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Wed, 16 Jul 2025 09:59:24 +0100 Subject: [PATCH 62/99] refactor(mmio): rename irq to gsi in MMIODeviceInfo To have a more consistent naming, it's best to use GSI instead of IRQ, at least in places where it's meant just as an abstract index. 
Signed-off-by: Riccardo Mancini Signed-off-by: Babis Chalios --- src/vmm/src/arch/aarch64/fdt.rs | 4 ++-- src/vmm/src/device_manager/mmio.rs | 32 +++++++++++++++--------------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index 429153669fa..6a50c0257a9 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -379,7 +379,7 @@ fn create_virtio_node(fdt: &mut FdtWriter, dev_info: &MMIODeviceInfo) -> Result< "interrupts", &[ GIC_FDT_IRQ_TYPE_SPI, - dev_info.irq.unwrap(), + dev_info.gsi.unwrap(), IRQ_TYPE_EDGE_RISING, ], )?; @@ -400,7 +400,7 @@ fn create_serial_node(fdt: &mut FdtWriter, dev_info: &MMIODeviceInfo) -> Result< "interrupts", &[ GIC_FDT_IRQ_TYPE_SPI, - dev_info.irq.unwrap(), + dev_info.gsi.unwrap(), IRQ_TYPE_EDGE_RISING, ], )?; diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 0cbc35535c9..a87646b11cf 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -73,8 +73,8 @@ pub struct MMIODeviceInfo { pub addr: u64, /// Mmio addr range length. pub len: u64, - /// Used Irq line for the device. - pub irq: Option, + /// Used GSI (interrupt line) for the device. + pub gsi: Option, } #[cfg(target_arch = "x86_64")] @@ -169,7 +169,7 @@ impl MMIODeviceManager { AllocPolicy::FirstMatch, )?, len: MMIO_LEN, - irq: gsi, + gsi, }; Ok(device_info) } @@ -183,7 +183,7 @@ impl MMIODeviceManager { ) -> Result<(), MmioError> { // Our virtio devices are currently hardcoded to use a single IRQ. // Validate that requirement. - let gsi = device.resources.irq.ok_or(MmioError::InvalidIrqConfig)?; + let gsi = device.resources.gsi.ok_or(MmioError::InvalidIrqConfig)?; let identifier; { let mmio_device = device.inner.lock().expect("Poisoned lock"); @@ -226,7 +226,7 @@ impl MMIODeviceManager { .add_virtio_mmio_device( device_info.len, GuestAddress(device_info.addr), - device_info.irq.unwrap(), + device_info.gsi.unwrap(), None, ) .map_err(MmioError::Cmdline) @@ -255,7 +255,7 @@ impl MMIODeviceManager { device.resources.len, // We are sure that `irqs` has at least one element; allocate_mmio_resources makes // sure of it. 
- device.resources.irq.unwrap(), + device.resources.gsi.unwrap(), )?; } self.register_mmio_virtio(vm, device_id, device)?; @@ -280,13 +280,13 @@ impl MMIODeviceManager { MMIODeviceInfo { addr: SERIAL_MEM_START, len: MMIO_LEN, - irq: Some(gsi[0]), + gsi: Some(gsi[0]), } }; vm.register_irq( serial.lock().expect("Poisoned lock").serial.interrupt_evt(), - device_info.irq.unwrap(), + device_info.gsi.unwrap(), ) .map_err(MmioError::RegisterIrqFd)?; @@ -339,7 +339,7 @@ impl MMIODeviceManager { MMIODeviceInfo { addr: RTC_MEM_START, len: MMIO_LEN, - irq: Some(gsi[0]), + gsi: Some(gsi[0]), } }; @@ -367,7 +367,7 @@ impl MMIODeviceManager { let device_info = MMIODeviceInfo { addr: BOOT_DEVICE_MEM_START, len: MMIO_LEN, - irq: None, + gsi: None, }; let device = MMIODevice { @@ -496,7 +496,7 @@ pub(crate) mod tests { pub fn used_irqs_count(&self) -> usize { self.virtio_devices .iter() - .filter(|(_, mmio_dev)| mmio_dev.resources.irq.is_some()) + .filter(|(_, mmio_dev)| mmio_dev.resources.gsi.is_some()) .count() } } @@ -612,7 +612,7 @@ pub(crate) mod tests { let dev = device_manager.get_virtio_device(0, "dummy").unwrap(); assert_eq!(dev.resources.addr, arch::MEM_32BIT_DEVICES_START); assert_eq!(dev.resources.len, MMIO_LEN); - assert_eq!(dev.resources.irq, Some(arch::GSI_LEGACY_START)); + assert_eq!(dev.resources.gsi, Some(arch::GSI_LEGACY_START)); device_manager .for_each_virtio_device(|virtio_type, device_id, mmio_device| { @@ -620,7 +620,7 @@ pub(crate) mod tests { assert_eq!(device_id, "dummy"); assert_eq!(mmio_device.resources.addr, arch::MEM_32BIT_DEVICES_START); assert_eq!(mmio_device.resources.len, MMIO_LEN); - assert_eq!(mmio_device.resources.irq, Some(arch::GSI_LEGACY_START)); + assert_eq!(mmio_device.resources.gsi, Some(arch::GSI_LEGACY_START)); Ok::<(), ()>(()) }) .unwrap(); @@ -714,7 +714,7 @@ pub(crate) mod tests { crate::arch::GSI_LEGACY_START, device_manager.virtio_devices[&(type_id, id)] .resources - .irq + .gsi .unwrap() ); @@ -751,7 +751,7 @@ pub(crate) mod tests { let device_info = device_manager .allocate_mmio_resources(&mut resource_allocator, 0) .unwrap(); - assert!(device_info.irq.is_none()); + assert!(device_info.gsi.is_none()); } #[test] @@ -762,7 +762,7 @@ pub(crate) mod tests { let device_info = device_manager .allocate_mmio_resources(&mut resource_allocator, 1) .unwrap(); - assert_eq!(device_info.irq.unwrap(), crate::arch::GSI_LEGACY_START); + assert_eq!(device_info.gsi.unwrap(), crate::arch::GSI_LEGACY_START); } #[test] From 01a3ef066dfd19fbd8137d7ccfe9fc3f8996e60e Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Thu, 10 Jul 2025 10:46:44 +0100 Subject: [PATCH 63/99] test(pci): make test_net_change_mac_address pass with PCI devices This patch makes 2 changes to make the test work on PCI: - simplify logic to find device address to be generic irrespective of ACPI/no-ACPI, PCI/no-PCI - move config offset from within the C program to the python test, as it's different between MMIO (0x100) and PCI (0x4000) Signed-off-by: Riccardo Mancini Signed-off-by: Babis Chalios --- tests/host_tools/change_net_config_space.c | 16 +-- .../functional/test_net_config_space.py | 110 +++--------------- 2 files changed, 27 insertions(+), 99 deletions(-) diff --git a/tests/host_tools/change_net_config_space.c b/tests/host_tools/change_net_config_space.c index 7b803bdc878..592a0cfe6af 100644 --- a/tests/host_tools/change_net_config_space.c +++ b/tests/host_tools/change_net_config_space.c @@ -14,7 +14,7 @@ #include int show_usage() { - printf("Usage: ./change_net_config_space.bin [dev_addr_start] 
[mac_addr]\n"); + printf("Usage: ./change_net_config_space.bin [dev_addr] [mac_addr]\n"); printf("Example:\n"); printf("> ./change_net_config_space.bin 0xd00001000 0x060504030201\n"); return 0; @@ -25,18 +25,17 @@ int main(int argc, char *argv[]) { uint8_t *map_base; volatile uint8_t *virt_addr; - uint64_t mapped_size, page_size, offset_in_page, target; + uint64_t mapped_size, page_size, page_addr, offset_in_page; uint64_t width = 6; - uint64_t config_offset = 0x100; - uint64_t device_start_addr = 0x00000000; + uint64_t dev_addr = 0x00000000; uint64_t mac = 0; if (argc != 3) { return show_usage(); } - device_start_addr = strtoull(argv[1], NULL, 0); + dev_addr = strtoull(argv[1], NULL, 0); mac = strtoull(argv[2], NULL, 0); fd = open("/dev/mem", O_RDWR | O_SYNC); @@ -45,11 +44,11 @@ int main(int argc, char *argv[]) { return 1; } - target = device_start_addr + config_offset; // Get the page size. mapped_size = page_size = getpagesize(); // Get the target address physical frame page offset. - offset_in_page = (unsigned) target & (page_size - 1); + offset_in_page = (unsigned) dev_addr & (page_size - 1); + page_addr = dev_addr & ~(page_size - 1); /* If the data length goes out of the current page, * double the needed map size. */ if (offset_in_page + width > page_size) { @@ -64,7 +63,8 @@ int main(int argc, char *argv[]) { PROT_READ | PROT_WRITE, MAP_SHARED, fd, - target & ~(off_t)(page_size - 1)); + page_addr + ); if (map_base == MAP_FAILED) { perror("Failed to mmap '/dev/mem'."); return 2; diff --git a/tests/integration_tests/functional/test_net_config_space.py b/tests/integration_tests/functional/test_net_config_space.py index c4ddfea9189..23ec0ba07b5 100644 --- a/tests/integration_tests/functional/test_net_config_space.py +++ b/tests/integration_tests/functional/test_net_config_space.py @@ -2,9 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 """Tests on devices config space.""" -import platform import random -import re import string import subprocess from threading import Thread @@ -15,14 +13,16 @@ PAYLOAD_DATA_SIZE = 20 -def test_net_change_mac_address(uvm_plain_any, change_net_config_space_bin): +def test_net_change_mac_address( + uvm_plain_any, pci_enabled, change_net_config_space_bin +): """ Test changing the MAC address of the network device. """ test_microvm = uvm_plain_any test_microvm.help.enable_console() - test_microvm.spawn() + test_microvm.spawn(pci=pci_enabled) test_microvm.basic_config(boot_args="ipv6.disable=1") # Data exchange interface ('eth0' in guest). @@ -64,6 +64,8 @@ def test_net_change_mac_address(uvm_plain_any, change_net_config_space_bin): net_addr_base = _get_net_mem_addr_base(ssh_conn, guest_if1_name) assert net_addr_base is not None + config_offset = 0x4000 if test_microvm.pci_enabled else 0x100 + dev_addr = net_addr_base + config_offset # Write into '/dev/mem' the same mac address, byte by byte. # This changes the MAC address physically, in the network device registers. @@ -72,7 +74,7 @@ def test_net_change_mac_address(uvm_plain_any, change_net_config_space_bin): # `tx_spoofed_mac_count` metric shouldn't be incremented later on. rmt_path = "/tmp/change_net_config_space" test_microvm.ssh.scp_put(change_net_config_space_bin, rmt_path) - cmd = f"chmod u+x {rmt_path} && {rmt_path} {net_addr_base} {mac_hex}" + cmd = f"chmod u+x {rmt_path} && {rmt_path} {dev_addr} {mac_hex}" # This should be executed successfully. 
_, stdout, _ = ssh_conn.check_output(cmd) @@ -219,8 +221,7 @@ def _find_iomem_range(ssh_connection, dev_name): # its contents and grep for the VirtIO device name, which # with ACPI is "LNRO0005:XY". cmd = f"cat /proc/iomem | grep -m 1 {dev_name}" - rc, stdout, stderr = ssh_connection.run(cmd) - assert rc == 0, stderr + _, stdout, _ = ssh_connection.check_output(cmd) # Take range in the form 'start-end' from line. The line looks like this: # d00002000-d0002fff : LNRO0005:02 @@ -231,89 +232,16 @@ def _find_iomem_range(ssh_connection, dev_name): return (int(tokens[0], 16), int(tokens[1], 16)) -def _get_net_mem_addr_base_x86_acpi(ssh_connection, if_name): - """Check for net device memory start address via ACPI info""" - # On x86 we define VirtIO devices through ACPI AML bytecode. VirtIO devices - # are identified as "LNRO0005" and appear under /sys/devices/platform - sys_virtio_mmio_cmdline = "/sys/devices/platform/" - cmd = "ls {}" - _, stdout, _ = ssh_connection.check_output(cmd.format(sys_virtio_mmio_cmdline)) - virtio_devs = list(filter(lambda x: "LNRO0005" in x, stdout.strip().split())) - - # For virtio-net LNRO0005 devices, we should have a path like: - # /sys/devices/platform/LNRO0005::XY/virtioXY/net which is a directory - # that includes a subdirectory `ethZ` which represents the network device - # that corresponds to the virtio-net device. - cmd = "ls {}/{}/virtio{}/net" - for idx, dev in enumerate(virtio_devs): - _, guest_if_name, _ = ssh_connection.run( - cmd.format(sys_virtio_mmio_cmdline, dev, idx) - ) - if guest_if_name.strip() == if_name: - return _find_iomem_range(ssh_connection, dev)[0] - - return None - - -def _get_net_mem_addr_base_x86_cmdline(ssh_connection, if_name): - """Check for net device memory start address via command line arguments""" - sys_virtio_mmio_cmdline = "/sys/devices/virtio-mmio-cmdline/" - cmd = "ls {} | grep virtio-mmio. | sed 's/virtio-mmio.//'" - exit_code, stdout, stderr = ssh_connection.run(cmd.format(sys_virtio_mmio_cmdline)) - assert exit_code == 0, stderr - virtio_devs_idx = stdout.strip().split() - - cmd = "cat /proc/cmdline" - _, cmd_line, _ = ssh_connection.check_output(cmd) - pattern_dev = re.compile("(virtio_mmio.device=4K@0x[0-9a-f]+:[0-9]+)+") - pattern_addr = re.compile("virtio_mmio.device=4K@(0x[0-9a-f]+):[0-9]+") - devs_addr = [] - for dev in re.findall(pattern_dev, cmd_line): - matched_addr = pattern_addr.search(dev) - # The 1st group which matches this pattern - # is the device start address. 
`0` group is - # full match - addr = matched_addr.group(1) - devs_addr.append(addr) - - cmd = "ls {}/virtio-mmio.{}/virtio{}/net" - for idx in virtio_devs_idx: - _, guest_if_name, _ = ssh_connection.run( - cmd.format(sys_virtio_mmio_cmdline, idx, idx) - ) - if guest_if_name.strip() == if_name: - return devs_addr[int(idx)] - - return None - - def _get_net_mem_addr_base(ssh_connection, if_name): """Get the net device memory start address.""" - if platform.machine() == "x86_64": - acpi_info = _get_net_mem_addr_base_x86_acpi(ssh_connection, if_name) - if acpi_info is not None: - return acpi_info - - return _get_net_mem_addr_base_x86_cmdline(ssh_connection, if_name) - - if platform.machine() == "aarch64": - sys_virtio_mmio_cmdline = "/sys/devices/platform" - cmd = "ls {} | grep .virtio_mmio".format(sys_virtio_mmio_cmdline) - rc, stdout, _ = ssh_connection.run(cmd) - assert rc == 0 - - virtio_devs = stdout.split() - devs_addr = list(map(lambda dev: dev.split(".")[0], virtio_devs)) - - cmd = "ls {}/{}/virtio{}/net" - # Device start addresses lack the hex prefix and are not interpreted - # accordingly when parsed inside `change_config_space.c`. - hex_prefix = "0x" - for idx, dev in enumerate(virtio_devs): - _, guest_if_name, _ = ssh_connection.run( - cmd.format(sys_virtio_mmio_cmdline, dev, idx) - ) - if guest_if_name.strip() == if_name: - return hex_prefix + devs_addr[int(idx)] - - return None + _, stdout, _ = ssh_connection.check_output(f"find /sys/devices -name {if_name}") + device_paths = stdout.strip().split("\n") + assert ( + len(device_paths) == 1 + ), f"No or multiple devices found for {if_name}:\n{stdout}" + device_path = device_paths[0] + parts = device_path.split("/") + assert len(parts) >= 6, f"Unexpected device path: {device_path}" + device = parts[-4] + start_addr, _ = _find_iomem_range(ssh_connection, device) + return start_addr From a1a065fd5d9c1d1111082d11e181c275e856ceb5 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Wed, 9 Jul 2025 17:15:26 +0100 Subject: [PATCH 64/99] test(rootfs): disable predictable netif names Tell systemd not to use "predictable names" for network devices (eg enp0s1), but keep the ethN set by the kernel. This is equivalent to passing net.ifnames=0 to the kernel command line. Signed-off-by: Riccardo Mancini Signed-off-by: Babis Chalios --- resources/chroot.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/resources/chroot.sh b/resources/chroot.sh index f87ae4aea08..7aadeddb884 100755 --- a/resources/chroot.sh +++ b/resources/chroot.sh @@ -64,6 +64,10 @@ rm -vf /etc/systemd/system/timers.target.wants/* systemctl enable var-lib-systemd.mount +# disable Predictable Network Interface Names to keep ethN names +# even with PCI enabled +ln -s /dev/null /etc/systemd/network/99-default.link + #### trim image https://wiki.ubuntu.com/ReducingDiskFootprint # this does not save much, but oh well rm -rf /usr/share/{doc,man,info,locale} From 54d27b9fccbe2b83f0acdf07dfa4f01aeaec9d2b Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Tue, 8 Jul 2025 12:15:22 +0100 Subject: [PATCH 65/99] test(pci): remove pci=off command line from tests pci=off is just an optimization to skip the probing, it shouldn't matter to the functionality of the tests. Dropping it to allow them to run with PCI. 
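For illustration, a minimal sketch of what the affected tests end up passing (the helper
name is hypothetical; the arguments mirror the diffs below):

    def configure_guest(vm):
        # Same boot arguments as before, just without "pci=off". On an MMIO-only
        # microVM the guest merely probes an empty PCI bus at boot; with the PCI
        # transport enabled, the same command line lets it discover its devices.
        vm.basic_config(
            vcpu_count=1,
            boot_args="console=ttyS0 reboot=k panic=1",
        )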
Signed-off-by: Riccardo Mancini Signed-off-by: Babis Chalios --- tests/framework/vm_config.json | 2 +- tests/framework/vm_config_cpu_template_C3.json | 2 +- tests/framework/vm_config_missing_mem_size_mib.json | 2 +- tests/framework/vm_config_missing_vcpu_count.json | 2 +- tests/framework/vm_config_network.json | 2 +- tests/framework/vm_config_smt_true.json | 2 +- tests/framework/vm_config_with_mmdsv1.json | 2 +- tests/framework/vm_config_with_mmdsv2.json | 2 +- tests/integration_tests/functional/test_error_code.py | 2 +- .../functional/test_kernel_cmdline.py | 3 +-- tests/integration_tests/functional/test_serial_io.py | 10 ++++------ tests/integration_tests/performance/test_boottime.py | 3 +-- tests/integration_tests/performance/test_initrd.py | 2 +- 13 files changed, 16 insertions(+), 20 deletions(-) diff --git a/tests/framework/vm_config.json b/tests/framework/vm_config.json index 5df673308d9..6948002e245 100644 --- a/tests/framework/vm_config.json +++ b/tests/framework/vm_config.json @@ -1,7 +1,7 @@ { "boot-source": { "kernel_image_path": "vmlinux.bin", - "boot_args": "console=ttyS0 reboot=k panic=1 pci=off", + "boot_args": "console=ttyS0 reboot=k panic=1", "initrd_path": null }, "drives": [ diff --git a/tests/framework/vm_config_cpu_template_C3.json b/tests/framework/vm_config_cpu_template_C3.json index 3b842594a18..b6dbf124022 100644 --- a/tests/framework/vm_config_cpu_template_C3.json +++ b/tests/framework/vm_config_cpu_template_C3.json @@ -1,7 +1,7 @@ { "boot-source": { "kernel_image_path": "vmlinux.bin", - "boot_args": "console=ttyS0 reboot=k panic=1 pci=off" + "boot_args": "console=ttyS0 reboot=k panic=1" }, "drives": [ { diff --git a/tests/framework/vm_config_missing_mem_size_mib.json b/tests/framework/vm_config_missing_mem_size_mib.json index 15ff19fa1b3..ea20d152473 100644 --- a/tests/framework/vm_config_missing_mem_size_mib.json +++ b/tests/framework/vm_config_missing_mem_size_mib.json @@ -1,7 +1,7 @@ { "boot-source": { "kernel_image_path": "vmlinux.bin", - "boot_args": "console=ttyS0 reboot=k panic=1 pci=off" + "boot_args": "console=ttyS0 reboot=k panic=1" }, "drives": [ { diff --git a/tests/framework/vm_config_missing_vcpu_count.json b/tests/framework/vm_config_missing_vcpu_count.json index b5aac05ddd2..39bb6a38954 100644 --- a/tests/framework/vm_config_missing_vcpu_count.json +++ b/tests/framework/vm_config_missing_vcpu_count.json @@ -1,7 +1,7 @@ { "boot-source": { "kernel_image_path": "vmlinux.bin", - "boot_args": "console=ttyS0 reboot=k panic=1 pci=off" + "boot_args": "console=ttyS0 reboot=k panic=1" }, "drives": [ { diff --git a/tests/framework/vm_config_network.json b/tests/framework/vm_config_network.json index a081e4f6990..7e25823cd66 100644 --- a/tests/framework/vm_config_network.json +++ b/tests/framework/vm_config_network.json @@ -1,7 +1,7 @@ { "boot-source": { "kernel_image_path": "vmlinux.bin", - "boot_args": "console=ttyS0 reboot=k panic=1 pci=off", + "boot_args": "console=ttyS0 reboot=k panic=1", "initrd_path": null }, "drives": [ diff --git a/tests/framework/vm_config_smt_true.json b/tests/framework/vm_config_smt_true.json index 3a1b79a1752..383bf68519a 100644 --- a/tests/framework/vm_config_smt_true.json +++ b/tests/framework/vm_config_smt_true.json @@ -1,7 +1,7 @@ { "boot-source": { "kernel_image_path": "vmlinux.bin", - "boot_args": "console=ttyS0 reboot=k panic=1 pci=off" + "boot_args": "console=ttyS0 reboot=k panic=1" }, "drives": [ { diff --git a/tests/framework/vm_config_with_mmdsv1.json b/tests/framework/vm_config_with_mmdsv1.json index 
6c30e535b1d..30f67ff5bfa 100644 --- a/tests/framework/vm_config_with_mmdsv1.json +++ b/tests/framework/vm_config_with_mmdsv1.json @@ -1,7 +1,7 @@ { "boot-source": { "kernel_image_path": "vmlinux.bin", - "boot_args": "console=ttyS0 reboot=k panic=1 pci=off", + "boot_args": "console=ttyS0 reboot=k panic=1", "initrd_path": null }, "drives": [ diff --git a/tests/framework/vm_config_with_mmdsv2.json b/tests/framework/vm_config_with_mmdsv2.json index b5855b9faa4..f766129f02f 100644 --- a/tests/framework/vm_config_with_mmdsv2.json +++ b/tests/framework/vm_config_with_mmdsv2.json @@ -1,7 +1,7 @@ { "boot-source": { "kernel_image_path": "vmlinux.bin", - "boot_args": "console=ttyS0 reboot=k panic=1 pci=off", + "boot_args": "console=ttyS0 reboot=k panic=1", "initrd_path": null }, "drives": [ diff --git a/tests/integration_tests/functional/test_error_code.py b/tests/integration_tests/functional/test_error_code.py index d1a74b6f418..171d3853460 100644 --- a/tests/integration_tests/functional/test_error_code.py +++ b/tests/integration_tests/functional/test_error_code.py @@ -25,7 +25,7 @@ def test_enosys_error_code(uvm_plain): vm.memory_monitor = None vm.basic_config( vcpu_count=1, - boot_args="reboot=k panic=1 pci=off init=/usr/local/bin/devmemread", + boot_args="reboot=k panic=1 init=/usr/local/bin/devmemread", ) vm.start() diff --git a/tests/integration_tests/functional/test_kernel_cmdline.py b/tests/integration_tests/functional/test_kernel_cmdline.py index 9707eb8a92c..7ba345f2111 100644 --- a/tests/integration_tests/functional/test_kernel_cmdline.py +++ b/tests/integration_tests/functional/test_kernel_cmdline.py @@ -21,8 +21,7 @@ def test_init_params(uvm_plain): # Ubuntu version from the /etc/issue file. vm.basic_config( vcpu_count=1, - boot_args="console=ttyS0 reboot=k panic=1 pci=off" - " init=/bin/cat -- /etc/issue", + boot_args="console=ttyS0 reboot=k panic=1 init=/bin/cat -- /etc/issue", ) vm.start() diff --git a/tests/integration_tests/functional/test_serial_io.py b/tests/integration_tests/functional/test_serial_io.py index 01900ec55e0..9005d0896b3 100644 --- a/tests/integration_tests/functional/test_serial_io.py +++ b/tests/integration_tests/functional/test_serial_io.py @@ -55,7 +55,7 @@ def test_serial_after_snapshot(uvm_plain, microvm_factory): microvm.basic_config( vcpu_count=2, mem_size_mib=256, - boot_args="console=ttyS0 reboot=k panic=1 pci=off", + boot_args="console=ttyS0 reboot=k panic=1", ) serial = Serial(microvm) serial.open() @@ -99,9 +99,7 @@ def test_serial_console_login(uvm_plain_any): microvm.memory_monitor = None # Set up the microVM with 1 vCPU and a serial console. - microvm.basic_config( - vcpu_count=1, boot_args="console=ttyS0 reboot=k panic=1 pci=off" - ) + microvm.basic_config(vcpu_count=1, boot_args="console=ttyS0 reboot=k panic=1") microvm.start() @@ -146,7 +144,7 @@ def test_serial_dos(uvm_plain_any): # Set up the microVM with 1 vCPU and a serial console. 
microvm.basic_config( vcpu_count=1, - boot_args="console=ttyS0 reboot=k panic=1 pci=off", + boot_args="console=ttyS0 reboot=k panic=1", ) microvm.add_net_iface() microvm.start() @@ -180,7 +178,7 @@ def test_serial_block(uvm_plain_any): test_microvm.basic_config( vcpu_count=1, mem_size_mib=512, - boot_args="console=ttyS0 reboot=k panic=1 pci=off", + boot_args="console=ttyS0 reboot=k panic=1", ) test_microvm.add_net_iface() test_microvm.start() diff --git a/tests/integration_tests/performance/test_boottime.py b/tests/integration_tests/performance/test_boottime.py index 7b7bd2a506a..0d8ff394d25 100644 --- a/tests/integration_tests/performance/test_boottime.py +++ b/tests/integration_tests/performance/test_boottime.py @@ -98,14 +98,13 @@ def launch_vm_with_boot_timer( microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib, pci_enabled ): """Launches a microVM with guest-timer and returns the reported metrics for it""" - boot_args = DEFAULT_BOOT_ARGS if pci_enabled else DEFAULT_BOOT_ARGS + " pci=off" vm = microvm_factory.build(guest_kernel_acpi, rootfs_rw) vm.jailer.extra_args.update({"boot-timer": None}) vm.spawn(pci=pci_enabled) vm.basic_config( vcpu_count=vcpu_count, mem_size_mib=mem_size_mib, - boot_args=boot_args + " init=/usr/local/bin/init", + boot_args=DEFAULT_BOOT_ARGS + " init=/usr/local/bin/init", enable_entropy_device=True, ) vm.add_net_iface() diff --git a/tests/integration_tests/performance/test_initrd.py b/tests/integration_tests/performance/test_initrd.py index 3845e5610c0..0db8578a5ef 100644 --- a/tests/integration_tests/performance/test_initrd.py +++ b/tests/integration_tests/performance/test_initrd.py @@ -33,7 +33,7 @@ def test_microvm_initrd_with_serial(uvm_with_initrd, huge_pages): vm.basic_config( add_root_device=False, vcpu_count=1, - boot_args="console=ttyS0 reboot=k panic=1 pci=off", + boot_args="console=ttyS0 reboot=k panic=1", use_initrd=True, huge_pages=huge_pages, ) From 387407526d0aa8ae3bf19a407c58faed2dccd323 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Tue, 8 Jul 2025 12:15:22 +0100 Subject: [PATCH 66/99] test(pci): parametrize uvm_plain* with pci All tests using uvm_plain or uvm_plain_any will start using PCI as well, allowing more coverage for the PCI code. This requires moving the PCI configuration to the VM factory from the spawn method. 
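A rough sketch of the resulting flow (the test itself is hypothetical; the fixture
signature matches the conftest.py change below):

    def test_boots_on_either_transport(uvm_plain_any):
        # The fixture already built the microVM with pci=pci_enabled, so the same
        # test body runs unchanged for the MMIO and PCI variants.
        vm = uvm_plain_any
        vm.spawn()          # spawn() no longer takes a pci argument
        vm.basic_config()
        vm.add_net_iface()
        vm.start()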
Signed-off-by: Riccardo Mancini Signed-off-by: Babis Chalios --- tests/conftest.py | 12 +++++----- tests/framework/microvm.py | 11 +++++----- .../functional/test_max_devices.py | 16 ++++++++------ .../functional/test_net_config_space.py | 6 ++--- .../integration_tests/functional/test_rng.py | 21 +++++++----------- .../functional/test_vsock.py | 22 +++++++++---------- .../performance/test_block.py | 6 +++-- .../performance/test_boottime.py | 4 ++-- .../performance/test_network.py | 6 +++-- .../performance/test_vsock.py | 6 +++-- 10 files changed, 55 insertions(+), 55 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index bf4cb5c3649..369f164eec7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -509,9 +509,9 @@ def rootfs_rw(): @pytest.fixture -def uvm_plain(microvm_factory, guest_kernel_linux_5_10, rootfs): +def uvm_plain(microvm_factory, guest_kernel_linux_5_10, rootfs, pci_enabled): """Create a vanilla VM, non-parametrized""" - return microvm_factory.build(guest_kernel_linux_5_10, rootfs) + return microvm_factory.build(guest_kernel_linux_5_10, rootfs, pci=pci_enabled) @pytest.fixture @@ -537,12 +537,12 @@ def artifact_dir(): @pytest.fixture -def uvm_plain_any(microvm_factory, guest_kernel, rootfs): +def uvm_plain_any(microvm_factory, guest_kernel, rootfs, pci_enabled): """All guest kernels kernel: all rootfs: Ubuntu 24.04 """ - return microvm_factory.build(guest_kernel, rootfs) + return microvm_factory.build(guest_kernel, rootfs, pci=pci_enabled) guest_kernel_6_1_debug = pytest.fixture( @@ -585,8 +585,8 @@ def uvm_booted( mem_size_mib=256, ): """Return a booted uvm""" - uvm = microvm_factory.build(guest_kernel, rootfs) - uvm.spawn(pci=pci_enabled) + uvm = microvm_factory.build(guest_kernel, rootfs, pci=pci_enabled) + uvm.spawn() uvm.basic_config(vcpu_count=vcpu_count, mem_size_mib=mem_size_mib) uvm.set_cpu_template(cpu_template) uvm.add_net_iface() diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index 3858886585a..45850cce211 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -206,6 +206,7 @@ def __init__( jailer_kwargs: Optional[dict] = None, numa_node=None, custom_cpu_template: Path = None, + pci: bool = False, ): """Set up microVM attributes, paths, and data structures.""" # pylint: disable=too-many-statements @@ -213,7 +214,6 @@ def __init__( assert microvm_id is not None self._microvm_id = microvm_id - self.pci_enabled = False self.kernel_file = None self.rootfs_file = None self.ssh_key = None @@ -237,6 +237,10 @@ def __init__( **jailer_kwargs, ) + self.pci_enabled = pci + if pci: + self.jailer.extra_args["enable-pci"] = None + # Copy the /etc/localtime file in the jailer root self.jailer.jailed_path("/etc/localtime", subdir="etc") @@ -635,7 +639,6 @@ def spawn( log_show_origin=False, metrics_path="fc.ndjson", emit_metrics: bool = False, - pci: bool = False, ): """Start a microVM as a daemon or in a screen session.""" # pylint: disable=subprocess-run-check @@ -681,10 +684,6 @@ def spawn( # Checking the timings requires DEBUG level log messages self.time_api_requests = False - if pci: - self.pci_enabled = True - self.jailer.extra_args["enable-pci"] = None - cmd = [ *self._pre_cmd, str(self.jailer_binary_path), diff --git a/tests/integration_tests/functional/test_max_devices.py b/tests/integration_tests/functional/test_max_devices.py index 3c52127792d..7cf9922c77b 100644 --- a/tests/integration_tests/functional/test_max_devices.py +++ b/tests/integration_tests/functional/test_max_devices.py @@ -29,12 +29,13 @@ def 
max_devices(uvm): raise ValueError("Unknown platform") -def test_attach_maximum_devices(microvm_factory, guest_kernel, rootfs, pci_enabled): +def test_attach_maximum_devices(uvm_plain_any): """ Test attaching maximum number of devices to the microVM. """ - test_microvm = microvm_factory.build(guest_kernel, rootfs, monitor_memory=False) - test_microvm.spawn(pci=pci_enabled) + test_microvm = uvm_plain_any + test_microvm.memory_monitor = None + test_microvm.spawn() # The default 256mib is not enough for 94 ssh connections on aarch64. test_microvm.basic_config(mem_size_mib=512) @@ -52,12 +53,13 @@ def test_attach_maximum_devices(microvm_factory, guest_kernel, rootfs, pci_enabl test_microvm.ssh_iface(i).check_output("sync") -def test_attach_too_many_devices(microvm_factory, guest_kernel, rootfs, pci_enabled): +def test_attach_too_many_devices(uvm_plain): """ Test attaching to a microVM more devices than available IRQs. """ - test_microvm = microvm_factory.build(guest_kernel, rootfs, monitor_memory=False) - test_microvm.spawn(pci=pci_enabled) + test_microvm = uvm_plain + test_microvm.memory_monitor = None + test_microvm.spawn() # Set up a basic microVM. test_microvm.basic_config() @@ -73,7 +75,7 @@ def test_attach_too_many_devices(microvm_factory, guest_kernel, rootfs, pci_enab # `MAX_DEVICES_ATTACHED` devices should fail. error_str = ( ("Could not find an available device slot on the PCI bus.") - if pci_enabled + if test_microvm.pci_enabled else ( "Failed to allocate requested resource: The requested resource" " is not available." diff --git a/tests/integration_tests/functional/test_net_config_space.py b/tests/integration_tests/functional/test_net_config_space.py index 23ec0ba07b5..d58b49b6d4a 100644 --- a/tests/integration_tests/functional/test_net_config_space.py +++ b/tests/integration_tests/functional/test_net_config_space.py @@ -13,16 +13,14 @@ PAYLOAD_DATA_SIZE = 20 -def test_net_change_mac_address( - uvm_plain_any, pci_enabled, change_net_config_space_bin -): +def test_net_change_mac_address(uvm_plain_any, change_net_config_space_bin): """ Test changing the MAC address of the network device. """ test_microvm = uvm_plain_any test_microvm.help.enable_console() - test_microvm.spawn(pci=pci_enabled) + test_microvm.spawn() test_microvm.basic_config(boot_args="ipv6.disable=1") # Data exchange interface ('eth0' in guest). 
diff --git a/tests/integration_tests/functional/test_rng.py b/tests/integration_tests/functional/test_rng.py index f2acf96735a..8719472a121 100644 --- a/tests/integration_tests/functional/test_rng.py +++ b/tests/integration_tests/functional/test_rng.py @@ -8,12 +8,11 @@ from host_tools.network import SSHConnection -def uvm_with_rng_booted( - microvm_factory, guest_kernel, rootfs, rate_limiter, pci_enabled -): +def uvm_with_rng_booted(uvm_plain_any, microvm_factory, rate_limiter): """Return a booted microvm with virtio-rng configured""" - uvm = microvm_factory.build(guest_kernel, rootfs) - uvm.spawn(log_level="INFO", pci=pci_enabled) + # pylint: disable=unused-argument + uvm = uvm_plain_any + uvm.spawn(log_level="INFO") uvm.basic_config(vcpu_count=2, mem_size_mib=256) uvm.add_net_iface() uvm.api.entropy.put(rate_limiter=rate_limiter) @@ -23,13 +22,9 @@ def uvm_with_rng_booted( return uvm -def uvm_with_rng_restored( - microvm_factory, guest_kernel, rootfs, rate_limiter, pci_enabled -): +def uvm_with_rng_restored(uvm_plain_any, microvm_factory, rate_limiter): """Return a restored uvm with virtio-rng configured""" - uvm = uvm_with_rng_booted( - microvm_factory, guest_kernel, rootfs, rate_limiter, pci_enabled - ) + uvm = uvm_with_rng_booted(uvm_plain_any, microvm_factory, rate_limiter) snapshot = uvm.snapshot_full() uvm.kill() uvm2 = microvm_factory.build_from_snapshot(snapshot) @@ -50,9 +45,9 @@ def rate_limiter(request): @pytest.fixture -def uvm_any(microvm_factory, uvm_ctor, guest_kernel, rootfs, rate_limiter, pci_enabled): +def uvm_any(microvm_factory, uvm_ctor, uvm_plain_any, rate_limiter): """Return booted and restored uvms""" - return uvm_ctor(microvm_factory, guest_kernel, rootfs, rate_limiter, pci_enabled) + return uvm_ctor(uvm_plain_any, microvm_factory, rate_limiter) def list_rng_available(ssh_connection: SSHConnection) -> list[str]: diff --git a/tests/integration_tests/functional/test_vsock.py b/tests/integration_tests/functional/test_vsock.py index 5b6221c32a9..8c0d30700c6 100644 --- a/tests/integration_tests/functional/test_vsock.py +++ b/tests/integration_tests/functional/test_vsock.py @@ -37,7 +37,7 @@ TEST_WORKER_COUNT = 10 -def test_vsock(uvm_plain_any, pci_enabled, bin_vsock_path, test_fc_session_root_path): +def test_vsock(uvm_plain_any, bin_vsock_path, test_fc_session_root_path): """ Test guest and host vsock initiated connections. @@ -45,7 +45,7 @@ def test_vsock(uvm_plain_any, pci_enabled, bin_vsock_path, test_fc_session_root_ """ vm = uvm_plain_any - vm.spawn(pci=pci_enabled) + vm.spawn() vm.basic_config() vm.add_net_iface() @@ -102,12 +102,12 @@ def negative_test_host_connections(vm, blob_path, blob_hash): validate_fc_metrics(metrics) -def test_vsock_epipe(uvm_plain, pci_enabled, bin_vsock_path, test_fc_session_root_path): +def test_vsock_epipe(uvm_plain_any, bin_vsock_path, test_fc_session_root_path): """ Vsock negative test to validate SIGPIPE/EPIPE handling. """ - vm = uvm_plain - vm.spawn(pci=pci_enabled) + vm = uvm_plain_any + vm.spawn() vm.basic_config() vm.add_net_iface() vm.api.vsock.put(vsock_id="vsock0", guest_cid=3, uds_path=f"/{VSOCK_UDS_PATH}") @@ -129,7 +129,7 @@ def test_vsock_epipe(uvm_plain, pci_enabled, bin_vsock_path, test_fc_session_roo def test_vsock_transport_reset_h2g( - uvm_plain, pci_enabled, microvm_factory, bin_vsock_path, test_fc_session_root_path + uvm_plain_any, microvm_factory, bin_vsock_path, test_fc_session_root_path ): """ Vsock transport reset test. @@ -146,8 +146,8 @@ def test_vsock_transport_reset_h2g( 6. 
Close VM -> Load VM from Snapshot -> check that vsock device is still working. """ - test_vm = uvm_plain - test_vm.spawn(pci=pci_enabled) + test_vm = uvm_plain_any + test_vm.spawn() test_vm.basic_config(vcpu_count=2, mem_size_mib=256) test_vm.add_net_iface() test_vm.api.vsock.put(vsock_id="vsock0", guest_cid=3, uds_path=f"/{VSOCK_UDS_PATH}") @@ -215,12 +215,12 @@ def test_vsock_transport_reset_h2g( validate_fc_metrics(metrics) -def test_vsock_transport_reset_g2h(uvm_plain, pci_enabled, microvm_factory): +def test_vsock_transport_reset_g2h(uvm_plain_any, microvm_factory): """ Vsock transport reset test. """ - test_vm = uvm_plain - test_vm.spawn(pci=pci_enabled) + test_vm = uvm_plain_any + test_vm.spawn() test_vm.basic_config(vcpu_count=2, mem_size_mib=256) test_vm.add_net_iface() test_vm.api.vsock.put(vsock_id="vsock0", guest_cid=3, uds_path=f"/{VSOCK_UDS_PATH}") diff --git a/tests/integration_tests/performance/test_block.py b/tests/integration_tests/performance/test_block.py index 7fe9216e559..d88d2186ddf 100644 --- a/tests/integration_tests/performance/test_block.py +++ b/tests/integration_tests/performance/test_block.py @@ -176,8 +176,10 @@ def test_block_performance( """ Execute block device emulation benchmarking scenarios. """ - vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False) - vm.spawn(log_level="Info", emit_metrics=True, pci=pci_enabled) + vm = microvm_factory.build( + guest_kernel_acpi, rootfs, monitor_memory=False, pci=pci_enabled + ) + vm.spawn(log_level="Info", emit_metrics=True) vm.basic_config(vcpu_count=vcpus, mem_size_mib=GUEST_MEM_MIB) vm.add_net_iface() # Add a secondary block device for benchmark tests. diff --git a/tests/integration_tests/performance/test_boottime.py b/tests/integration_tests/performance/test_boottime.py index 0d8ff394d25..0e533a43d08 100644 --- a/tests/integration_tests/performance/test_boottime.py +++ b/tests/integration_tests/performance/test_boottime.py @@ -98,9 +98,9 @@ def launch_vm_with_boot_timer( microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib, pci_enabled ): """Launches a microVM with guest-timer and returns the reported metrics for it""" - vm = microvm_factory.build(guest_kernel_acpi, rootfs_rw) + vm = microvm_factory.build(guest_kernel_acpi, rootfs_rw, pci=pci_enabled) vm.jailer.extra_args.update({"boot-timer": None}) - vm.spawn(pci=pci_enabled) + vm.spawn() vm.basic_config( vcpu_count=vcpu_count, mem_size_mib=mem_size_mib, diff --git a/tests/integration_tests/performance/test_network.py b/tests/integration_tests/performance/test_network.py index 035fb5a2b59..cd2ab90dff3 100644 --- a/tests/integration_tests/performance/test_network.py +++ b/tests/integration_tests/performance/test_network.py @@ -45,8 +45,10 @@ def network_microvm(request, microvm_factory, guest_kernel_acpi, rootfs, pci_ena guest_mem_mib = 1024 guest_vcpus = request.param - vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False) - vm.spawn(log_level="Info", emit_metrics=True, pci=pci_enabled) + vm = microvm_factory.build( + guest_kernel_acpi, rootfs, monitor_memory=False, pci=pci_enabled + ) + vm.spawn(log_level="Info", emit_metrics=True) vm.basic_config(vcpu_count=guest_vcpus, mem_size_mib=guest_mem_mib) vm.add_net_iface() vm.start() diff --git a/tests/integration_tests/performance/test_vsock.py b/tests/integration_tests/performance/test_vsock.py index 5a023f53eea..076e0b41da0 100644 --- a/tests/integration_tests/performance/test_vsock.py +++ b/tests/integration_tests/performance/test_vsock.py @@ -95,8 
+95,10 @@ def test_vsock_throughput( pytest.skip("bidrectional test only done with at least 2 vcpus") mem_size_mib = 1024 - vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False) - vm.spawn(log_level="Info", emit_metrics=True, pci=pci_enabled) + vm = microvm_factory.build( + guest_kernel_acpi, rootfs, monitor_memory=False, pci=pci_enabled + ) + vm.spawn(log_level="Info", emit_metrics=True) vm.basic_config(vcpu_count=vcpus, mem_size_mib=mem_size_mib) vm.add_net_iface() # Create a vsock device From 4c25c99cd4362b48a8e0fa61ba1bae7edc427739 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Tue, 15 Jul 2025 15:45:27 +0100 Subject: [PATCH 67/99] refactor(test): use uvm_plain* fixture instead factory.build This patch updates all the places in the code with a uvm_plain* fixture when that was equivalent to the previous behaviour. In particular: - microvm_factory.build(guest_kernel_linux_5_10, rootfs) => uvm_plain - microvm_factory.build(guest_kernel, rootfs) => uvm_plain_any Signed-off-by: Riccardo Mancini Signed-off-by: Babis Chalios --- .../functional/test_balloon.py | 8 ++++---- .../functional/test_cpu_template_helper.py | 6 ++---- .../functional/test_feat_parity.py | 9 ++------- .../functional/test_snapshot_basic.py | 17 ++++++++--------- .../performance/test_huge_pages.py | 11 ++++------- .../performance/test_snapshot.py | 6 ++---- 6 files changed, 22 insertions(+), 35 deletions(-) diff --git a/tests/integration_tests/functional/test_balloon.py b/tests/integration_tests/functional/test_balloon.py index d23dc0785cb..314cd9b5afd 100644 --- a/tests/integration_tests/functional/test_balloon.py +++ b/tests/integration_tests/functional/test_balloon.py @@ -449,11 +449,11 @@ def test_stats_update(uvm_plain_any): assert next_stats["available_memory"] != final_stats["available_memory"] -def test_balloon_snapshot(microvm_factory, guest_kernel, rootfs): +def test_balloon_snapshot(uvm_plain_any, microvm_factory): """ Test that the balloon works after pause/resume. """ - vm = microvm_factory.build(guest_kernel, rootfs) + vm = uvm_plain_any vm.spawn() vm.basic_config( vcpu_count=2, @@ -531,11 +531,11 @@ def test_balloon_snapshot(microvm_factory, guest_kernel, rootfs): assert stats_after_snap["available_memory"] > latest_stats["available_memory"] -def test_memory_scrub(microvm_factory, guest_kernel, rootfs): +def test_memory_scrub(uvm_plain_any): """ Test that the memory is zeroed after deflate. 
""" - microvm = microvm_factory.build(guest_kernel, rootfs) + microvm = uvm_plain_any microvm.spawn() microvm.basic_config(vcpu_count=2, mem_size_mib=256) microvm.add_net_iface() diff --git a/tests/integration_tests/functional/test_cpu_template_helper.py b/tests/integration_tests/functional/test_cpu_template_helper.py index e4c087fa497..74f5c96cd47 100644 --- a/tests/integration_tests/functional/test_cpu_template_helper.py +++ b/tests/integration_tests/functional/test_cpu_template_helper.py @@ -266,9 +266,7 @@ def get_guest_msrs(microvm, msr_index_list): ), ) def test_cpu_config_dump_vs_actual( - microvm_factory, - guest_kernel, - rootfs, + uvm_plain_any, cpu_template_helper, tmp_path, ): @@ -282,7 +280,7 @@ def test_cpu_config_dump_vs_actual( dump_cpu_config = build_cpu_config_dict(cpu_config_path) # Retrieve actual CPU config from guest - microvm = microvm_factory.build(guest_kernel, rootfs) + microvm = uvm_plain_any microvm.spawn() microvm.basic_config(vcpu_count=1) microvm.add_net_iface() diff --git a/tests/integration_tests/functional/test_feat_parity.py b/tests/integration_tests/functional/test_feat_parity.py index 1eadbc6d29c..9fc89ffcd2c 100644 --- a/tests/integration_tests/functional/test_feat_parity.py +++ b/tests/integration_tests/functional/test_feat_parity.py @@ -28,16 +28,11 @@ def inst_set_cpu_template_fxt(request): @pytest.fixture(name="vm") -def vm_fxt( - microvm_factory, - inst_set_cpu_template, - guest_kernel, - rootfs, -): +def vm_fxt(uvm_plain_any, inst_set_cpu_template): """ Create a VM, using the normal CPU templates """ - vm = microvm_factory.build(guest_kernel, rootfs) + vm = uvm_plain_any vm.spawn() vm.basic_config(vcpu_count=1, mem_size_mib=1024, cpu_template=inst_set_cpu_template) vm.add_net_iface() diff --git a/tests/integration_tests/functional/test_snapshot_basic.py b/tests/integration_tests/functional/test_snapshot_basic.py index 2b786ea16ae..cbf2c718d55 100644 --- a/tests/integration_tests/functional/test_snapshot_basic.py +++ b/tests/integration_tests/functional/test_snapshot_basic.py @@ -115,9 +115,8 @@ def test_snapshot_current_version(uvm_nano): def test_cycled_snapshot_restore( bin_vsock_path, tmp_path, + uvm_plain_any, microvm_factory, - guest_kernel, - rootfs, snapshot_type, use_snapshot_editor, cpu_template_any, @@ -132,7 +131,7 @@ def test_cycled_snapshot_restore( logger = logging.getLogger("snapshot_sequence") - vm = microvm_factory.build(guest_kernel, rootfs) + vm = uvm_plain_any vm.spawn() vm.basic_config( vcpu_count=2, @@ -248,7 +247,7 @@ def test_load_snapshot_failure_handling(uvm_plain): vm.mark_killed() -def test_cmp_full_and_first_diff_mem(microvm_factory, guest_kernel, rootfs): +def test_cmp_full_and_first_diff_mem(uvm_plain_any): """ Compare memory of 2 consecutive full and diff snapshots. @@ -259,7 +258,7 @@ def test_cmp_full_and_first_diff_mem(microvm_factory, guest_kernel, rootfs): """ logger = logging.getLogger("snapshot_sequence") - vm = microvm_factory.build(guest_kernel, rootfs) + vm = uvm_plain_any vm.spawn() vm.basic_config( vcpu_count=2, @@ -414,12 +413,12 @@ def test_create_large_diff_snapshot(uvm_plain): # process would have been taken down. -def test_diff_snapshot_overlay(guest_kernel, rootfs, microvm_factory): +def test_diff_snapshot_overlay(uvm_plain_any, microvm_factory): """ Tests that if we take a diff snapshot and direct firecracker to write it on top of an existing snapshot file, it will successfully merge them. 
""" - basevm = microvm_factory.build(guest_kernel, rootfs) + basevm = uvm_plain_any basevm.spawn() basevm.basic_config(track_dirty_pages=True) basevm.add_net_iface() @@ -451,7 +450,7 @@ def test_diff_snapshot_overlay(guest_kernel, rootfs, microvm_factory): # Check that the restored VM works -def test_snapshot_overwrite_self(guest_kernel, rootfs, microvm_factory): +def test_snapshot_overwrite_self(uvm_plain_any, microvm_factory): """Tests that if we try to take a snapshot that would overwrite the very file from which the current VM is stored, nothing happens. @@ -459,7 +458,7 @@ def test_snapshot_overwrite_self(guest_kernel, rootfs, microvm_factory): of mmap does not specify what should happen if the file is changed after being mmap'd (https://man7.org/linux/man-pages/man2/mmap.2.html). It seems that these changes can propagate to the mmap'd memory region.""" - base_vm = microvm_factory.build(guest_kernel, rootfs) + base_vm = uvm_plain_any base_vm.spawn() base_vm.basic_config() base_vm.add_net_iface() diff --git a/tests/integration_tests/performance/test_huge_pages.py b/tests/integration_tests/performance/test_huge_pages.py index 04a9264977a..1c5a14873d1 100644 --- a/tests/integration_tests/performance/test_huge_pages.py +++ b/tests/integration_tests/performance/test_huge_pages.py @@ -68,9 +68,7 @@ def test_hugetlbfs_boot(uvm_plain): ) -def test_hugetlbfs_snapshot( - microvm_factory, guest_kernel_linux_5_10, rootfs, snapshot_type -): +def test_hugetlbfs_snapshot(microvm_factory, uvm_plain, snapshot_type): """ Test hugetlbfs snapshot restore via uffd @@ -79,7 +77,7 @@ def test_hugetlbfs_snapshot( """ ### Create Snapshot ### - vm = microvm_factory.build(guest_kernel_linux_5_10, rootfs) + vm = uvm_plain vm.memory_monitor = None vm.spawn() vm.basic_config( @@ -107,8 +105,7 @@ def test_hugetlbfs_snapshot( @pytest.mark.parametrize("huge_pages", HugePagesConfig) def test_ept_violation_count( microvm_factory, - guest_kernel_linux_5_10, - rootfs, + uvm_plain, metrics, huge_pages, ): @@ -118,7 +115,7 @@ def test_ept_violation_count( """ ### Create Snapshot ### - vm = microvm_factory.build(guest_kernel_linux_5_10, rootfs) + vm = uvm_plain vm.memory_monitor = None vm.spawn() vm.basic_config(huge_pages=huge_pages, mem_size_mib=256) diff --git a/tests/integration_tests/performance/test_snapshot.py b/tests/integration_tests/performance/test_snapshot.py index 24ca4ab974b..d8bd8b91c70 100644 --- a/tests/integration_tests/performance/test_snapshot.py +++ b/tests/integration_tests/performance/test_snapshot.py @@ -255,15 +255,13 @@ def test_population_latency( @pytest.mark.nonci def test_snapshot_create_latency( - microvm_factory, - guest_kernel_linux_5_10, - rootfs, + uvm_plain, metrics, snapshot_type, ): """Measure the latency of creating a Full snapshot""" - vm = microvm_factory.build(guest_kernel_linux_5_10, rootfs, monitor_memory=False) + vm = uvm_plain vm.spawn() vm.basic_config( vcpu_count=2, From d176d97430fbc17080fbbf2b2e9698f71e495100 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Tue, 15 Jul 2025 15:59:16 +0100 Subject: [PATCH 68/99] refactor(test): add uvm_plain_acpi and _6_1 fixtures Simplify the test code by introducing two new fixtures that are used in a few places in the code. This will also allow these tests to run on PCI. 
Signed-off-by: Riccardo Mancini Signed-off-by: Babis Chalios --- tests/conftest.py | 12 ++++++++++++ .../functional/test_snapshot_basic.py | 4 ++-- tests/integration_tests/performance/test_block.py | 15 ++++----------- .../integration_tests/performance/test_network.py | 6 ++---- .../performance/test_vhost_user_metrics.py | 6 ++---- tests/integration_tests/performance/test_vsock.py | 9 ++------- 6 files changed, 24 insertions(+), 28 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 369f164eec7..96ee285d192 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -514,6 +514,18 @@ def uvm_plain(microvm_factory, guest_kernel_linux_5_10, rootfs, pci_enabled): return microvm_factory.build(guest_kernel_linux_5_10, rootfs, pci=pci_enabled) +@pytest.fixture +def uvm_plain_6_1(microvm_factory, guest_kernel_linux_6_1, rootfs, pci_enabled): + """Create a vanilla VM, non-parametrized""" + return microvm_factory.build(guest_kernel_linux_6_1, rootfs, pci=pci_enabled) + + +@pytest.fixture +def uvm_plain_acpi(microvm_factory, guest_kernel_acpi, rootfs, pci_enabled): + """Create a vanilla VM, non-parametrized""" + return microvm_factory.build(guest_kernel_acpi, rootfs, pci=pci_enabled) + + @pytest.fixture def uvm_plain_rw(microvm_factory, guest_kernel_linux_5_10, rootfs_rw): """Create a vanilla VM, non-parametrized""" diff --git a/tests/integration_tests/functional/test_snapshot_basic.py b/tests/integration_tests/functional/test_snapshot_basic.py index cbf2c718d55..c4eac866028 100644 --- a/tests/integration_tests/functional/test_snapshot_basic.py +++ b/tests/integration_tests/functional/test_snapshot_basic.py @@ -482,11 +482,11 @@ def test_snapshot_overwrite_self(uvm_plain_any, microvm_factory): # restored, with a new snapshot of this vm, does not break the VM -def test_vmgenid(guest_kernel_linux_6_1, rootfs, microvm_factory, snapshot_type): +def test_vmgenid(uvm_plain_6_1, microvm_factory, snapshot_type): """ Test VMGenID device upon snapshot resume """ - base_vm = microvm_factory.build(guest_kernel_linux_6_1, rootfs) + base_vm = uvm_plain_6_1 base_vm.spawn() base_vm.basic_config(track_dirty_pages=True) base_vm.add_net_iface() diff --git a/tests/integration_tests/performance/test_block.py b/tests/integration_tests/performance/test_block.py index d88d2186ddf..7fdd9576a3a 100644 --- a/tests/integration_tests/performance/test_block.py +++ b/tests/integration_tests/performance/test_block.py @@ -161,14 +161,11 @@ def emit_fio_metrics(logs_dir, metrics): @pytest.mark.parametrize("fio_block_size", [4096], ids=["bs4096"]) @pytest.mark.parametrize("fio_engine", ["libaio", "psync"]) def test_block_performance( - microvm_factory, - guest_kernel_acpi, - rootfs, + uvm_any_acpi, vcpus, fio_mode, fio_block_size, fio_engine, - pci_enabled, io_engine, metrics, results_dir, @@ -176,9 +173,7 @@ def test_block_performance( """ Execute block device emulation benchmarking scenarios. 
""" - vm = microvm_factory.build( - guest_kernel_acpi, rootfs, monitor_memory=False, pci=pci_enabled - ) + vm = uvm_any_acpi vm.spawn(log_level="Info", emit_metrics=True) vm.basic_config(vcpu_count=vcpus, mem_size_mib=GUEST_MEM_MIB) vm.add_net_iface() @@ -216,9 +211,7 @@ def test_block_performance( @pytest.mark.parametrize("fio_mode", ["randread"]) @pytest.mark.parametrize("fio_block_size", [4096], ids=["bs4096"]) def test_block_vhost_user_performance( - microvm_factory, - guest_kernel_acpi, - rootfs, + uvm_any_acpi, vcpus, fio_mode, fio_block_size, @@ -229,7 +222,7 @@ def test_block_vhost_user_performance( Execute block device emulation benchmarking scenarios. """ - vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False) + vm = uvm_any_acpi vm.spawn(log_level="Info", emit_metrics=True) vm.basic_config(vcpu_count=vcpus, mem_size_mib=GUEST_MEM_MIB) vm.add_net_iface() diff --git a/tests/integration_tests/performance/test_network.py b/tests/integration_tests/performance/test_network.py index cd2ab90dff3..74ad26c26a8 100644 --- a/tests/integration_tests/performance/test_network.py +++ b/tests/integration_tests/performance/test_network.py @@ -38,16 +38,14 @@ def consume_ping_output(ping_putput): @pytest.fixture -def network_microvm(request, microvm_factory, guest_kernel_acpi, rootfs, pci_enabled): +def network_microvm(request, uvm_plain_acpi): """Creates a microvm with the networking setup used by the performance tests in this file. This fixture receives its vcpu count via indirect parameterization""" guest_mem_mib = 1024 guest_vcpus = request.param - vm = microvm_factory.build( - guest_kernel_acpi, rootfs, monitor_memory=False, pci=pci_enabled - ) + vm = uvm_plain_acpi vm.spawn(log_level="Info", emit_metrics=True) vm.basic_config(vcpu_count=guest_vcpus, mem_size_mib=guest_mem_mib) vm.add_net_iface() diff --git a/tests/integration_tests/performance/test_vhost_user_metrics.py b/tests/integration_tests/performance/test_vhost_user_metrics.py index fd20b34a47b..a278ae79971 100644 --- a/tests/integration_tests/performance/test_vhost_user_metrics.py +++ b/tests/integration_tests/performance/test_vhost_user_metrics.py @@ -10,9 +10,7 @@ @pytest.mark.parametrize("vcpu_count", [1, 2], ids=["1vcpu", "2vcpu"]) -def test_vhost_user_block_metrics( - microvm_factory, guest_kernel_acpi, rootfs, vcpu_count, metrics -): +def test_vhost_user_block_metrics(uvm_plain_acpi, vcpu_count, metrics): """ This test tries to boot a VM with vhost-user-block as a scratch device, resize the vhost-user scratch drive to have @@ -28,7 +26,7 @@ def test_vhost_user_block_metrics( # low->high->low->high and so the numbers are not in monotonic sequence. 
new_sizes = [20, 10, 30] # MB - vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False) + vm = uvm_plain_acpi vm.spawn(log_level="Info") vm.basic_config(vcpu_count=vcpu_count) vm.add_net_iface() diff --git a/tests/integration_tests/performance/test_vsock.py b/tests/integration_tests/performance/test_vsock.py index 076e0b41da0..402e7ff66b5 100644 --- a/tests/integration_tests/performance/test_vsock.py +++ b/tests/integration_tests/performance/test_vsock.py @@ -75,12 +75,9 @@ def guest_command(self, port_offset): @pytest.mark.parametrize("payload_length", ["64K", "1024K"], ids=["p64K", "p1024K"]) @pytest.mark.parametrize("mode", ["g2h", "h2g", "bd"]) def test_vsock_throughput( - microvm_factory, - guest_kernel_acpi, - rootfs, + uvm_plain_acpi, vcpus, payload_length, - pci_enabled, mode, metrics, results_dir, @@ -95,9 +92,7 @@ def test_vsock_throughput( pytest.skip("bidrectional test only done with at least 2 vcpus") mem_size_mib = 1024 - vm = microvm_factory.build( - guest_kernel_acpi, rootfs, monitor_memory=False, pci=pci_enabled - ) + vm = uvm_plain_acpi vm.spawn(log_level="Info", emit_metrics=True) vm.basic_config(vcpu_count=vcpus, mem_size_mib=mem_size_mib) vm.add_net_iface() From d8a909c21b3f2511d29174707df02d77589243d3 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Tue, 15 Jul 2025 16:00:30 +0100 Subject: [PATCH 69/99] test(concurrency): run also with PCI Run the test_run_concurrency with PCI enabled as well. Signed-off-by: Riccardo Mancini Signed-off-by: Babis Chalios --- tests/integration_tests/functional/test_concurrency.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration_tests/functional/test_concurrency.py b/tests/integration_tests/functional/test_concurrency.py index e4756729f2b..15394ec6ada 100644 --- a/tests/integration_tests/functional/test_concurrency.py +++ b/tests/integration_tests/functional/test_concurrency.py @@ -7,13 +7,13 @@ NO_OF_MICROVMS = 20 -def test_run_concurrency(microvm_factory, guest_kernel, rootfs): +def test_run_concurrency(microvm_factory, guest_kernel, rootfs, pci_enabled): """ Check we can spawn multiple microvms. """ def launch1(): - microvm = microvm_factory.build(guest_kernel, rootfs) + microvm = microvm_factory.build(guest_kernel, rootfs, pci=pci_enabled) microvm.time_api_requests = False # is flaky because of parallelism microvm.spawn() microvm.basic_config(vcpu_count=1, mem_size_mib=128) From 831b2dd56662241dfc8dffaae94263de666fac10 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Tue, 15 Jul 2025 16:00:56 +0100 Subject: [PATCH 70/99] test(vhost): run tests also with PCI enabled Refactor the code to use common fixtures and run all the tests with PCI enabled as well. 
Signed-off-by: Riccardo Mancini Signed-off-by: Babis Chalios --- .../functional/test_drive_vhost_user.py | 118 ++++++++++-------- 1 file changed, 65 insertions(+), 53 deletions(-) diff --git a/tests/integration_tests/functional/test_drive_vhost_user.py b/tests/integration_tests/functional/test_drive_vhost_user.py index 79cc41b0f3a..07fcafb715e 100644 --- a/tests/integration_tests/functional/test_drive_vhost_user.py +++ b/tests/integration_tests/functional/test_drive_vhost_user.py @@ -6,11 +6,62 @@ import shutil from pathlib import Path +import pytest + import host_tools.drive as drive_tools from framework.utils_drive import partuuid_and_disk_path from host_tools.fcmetrics import FcDeviceMetrics +@pytest.fixture +def uvm_vhost_user_plain_any(microvm_factory, guest_kernel, pci_enabled): + """Builds a plain VM with no root volume""" + return microvm_factory.build( + guest_kernel, None, pci=pci_enabled, monitor_memory=False + ) + + +@pytest.fixture +def uvm_vhost_user_booted_ro(uvm_vhost_user_plain_any, rootfs): + """Returns a VM with a vhost-user rootfs""" + vm = uvm_vhost_user_plain_any + + # We need to setup ssh keys manually because we did not specify rootfs + # in microvm_factory.build method + ssh_key = rootfs.with_suffix(".id_rsa") + vm.ssh_key = ssh_key + vm.spawn() + vm.basic_config(add_root_device=False) + vm.add_vhost_user_drive("rootfs", rootfs, is_root_device=True, is_read_only=True) + vm.add_net_iface() + vm.start() + + return vm + + +@pytest.fixture +def uvm_vhost_user_booted_rw(uvm_vhost_user_plain_any, rootfs): + """Returns a VM with a vhost-user rootfs""" + vm = uvm_vhost_user_plain_any + + # We need to setup ssh keys manually because we did not specify rootfs + # in microvm_factory.build method + ssh_key = rootfs.with_suffix(".id_rsa") + vm.ssh_key = ssh_key + vm.spawn() + vm.basic_config(add_root_device=False) + # Create a rw rootfs file that is unique to the microVM + rootfs_rw = Path(vm.chroot()) / "rootfs" + shutil.copy(rootfs, rootfs_rw) + vm.add_vhost_user_drive( + "rootfs", rootfs_rw, is_root_device=True, is_read_only=False + ) + vm.add_net_iface() + vm.start() + + return vm + + def _check_block_size(ssh_connection, dev_path, size): """ Checks the size of the block device. @@ -34,26 +85,16 @@ def _check_drives(test_microvm, assert_dict, keys_array): assert blockdev_out_line_cols[col] == assert_dict[key] -def test_vhost_user_block(microvm_factory, guest_kernel, rootfs): +def test_vhost_user_block(uvm_vhost_user_booted_ro): """ This test simply tries to boot a VM with vhost-user-block as a root device. """ - vm = microvm_factory.build(guest_kernel, None, monitor_memory=False) - - # We need to setup ssh keys manually because we did not specify rootfs - # in microvm_factory.build method - ssh_key = rootfs.with_suffix(".id_rsa") - vm.ssh_key = ssh_key - vm.spawn() - vm.basic_config(add_root_device=False) - vm.add_vhost_user_drive("rootfs", rootfs, is_root_device=True, is_read_only=True) - vm.add_net_iface() + vm = uvm_vhost_user_booted_ro vhost_user_block_metrics = FcDeviceMetrics( "vhost_user_block", 1, aggr_supported=False ) - vm.start() # Now check that vhost-user-block with rw is last. # 1-0 means line 1, column 0. @@ -65,29 +106,14 @@ def test_vhost_user_block(microvm_factory, guest_kernel, rootfs): vhost_user_block_metrics.validate(vm) -def test_vhost_user_block_read_write(microvm_factory, guest_kernel, rootfs): +def test_vhost_user_block_read_write(uvm_vhost_user_booted_rw): """ This test simply tries to boot a VM with vhost-user-block as a root device. 
This test configures vhost-user-block to be read write. """ - vm = microvm_factory.build(guest_kernel, None, monitor_memory=False) - - # We need to setup ssh keys manually because we did not specify rootfs - # in microvm_factory.build method - ssh_key = rootfs.with_suffix(".id_rsa") - vm.ssh_key = ssh_key - vm.spawn() - vm.basic_config(add_root_device=False) - - # Create a rw rootfs file that is unique to the microVM - rootfs_rw = Path(vm.chroot()) / "rootfs" - shutil.copy(rootfs, rootfs_rw) - - vm.add_vhost_user_drive("rootfs", rootfs_rw, is_root_device=True) - vm.add_net_iface() - vm.start() + vm = uvm_vhost_user_booted_rw # Now check that vhost-user-block with rw is last. # 1-0 means line 1, column 0. @@ -98,22 +124,12 @@ def test_vhost_user_block_read_write(microvm_factory, guest_kernel, rootfs): _check_drives(vm, assert_dict, assert_dict.keys()) -def test_vhost_user_block_disconnect(microvm_factory, guest_kernel, rootfs): +def test_vhost_user_block_disconnect(uvm_vhost_user_booted_ro): """ Test that even if backend is killed, Firecracker is still responsive. """ - vm = microvm_factory.build(guest_kernel, None, monitor_memory=False) - - # We need to set up ssh keys manually because we did not specify rootfs - # in microvm_factory.build method - ssh_key = rootfs.with_suffix(".id_rsa") - vm.ssh_key = ssh_key - vm.spawn() - vm.basic_config(add_root_device=False) - vm.add_vhost_user_drive("rootfs", rootfs, is_root_device=True, is_read_only=True) - vm.add_net_iface() - vm.start() + vm = uvm_vhost_user_booted_ro # Killing the backend vm.disks_vhost_user["rootfs"].kill() @@ -123,7 +139,7 @@ def test_vhost_user_block_disconnect(microvm_factory, guest_kernel, rootfs): _config = vm.api.vm_config.get().json() -def test_device_ordering(microvm_factory, guest_kernel, rootfs): +def test_device_ordering(uvm_vhost_user_plain_any, rootfs): """ Verify device ordering. @@ -131,7 +147,7 @@ def test_device_ordering(microvm_factory, guest_kernel, rootfs): the order of the other devices should match their configuration order. """ - vm = microvm_factory.build(guest_kernel, None, monitor_memory=False) + vm = uvm_vhost_user_plain_any # We need to setup ssh keys manually because we did not specify rootfs # in microvm_factory.build method @@ -194,16 +210,12 @@ def test_device_ordering(microvm_factory, guest_kernel, rootfs): vhost_user_block_metrics.validate(vm) -def test_partuuid_boot( - microvm_factory, - guest_kernel, - rootfs, -): +def test_partuuid_boot(uvm_vhost_user_plain_any, rootfs): """ Test the output reported by blockdev when booting with PARTUUID. """ - vm = microvm_factory.build(guest_kernel, None, monitor_memory=False) + vm = uvm_vhost_user_plain_any # We need to setup ssh keys manually because we did not specify rootfs # in microvm_factory.build method @@ -230,12 +242,12 @@ def test_partuuid_boot( _check_drives(vm, assert_dict, assert_dict.keys()) -def test_partuuid_update(microvm_factory, guest_kernel, rootfs): +def test_partuuid_update(uvm_vhost_user_plain_any, rootfs): """ Test successful switching from PARTUUID boot to /dev/vda boot. 
""" - vm = microvm_factory.build(guest_kernel, None, monitor_memory=False) + vm = uvm_vhost_user_plain_any # We need to setup ssh keys manually because we did not specify rootfs # in microvm_factory.build method @@ -272,7 +284,7 @@ def test_partuuid_update(microvm_factory, guest_kernel, rootfs): vhost_user_block_metrics.validate(vm) -def test_config_change(microvm_factory, guest_kernel, rootfs): +def test_config_change(uvm_plain_any): """ Verify handling of block device resize. We expect that the guest will start reporting the updated size @@ -283,7 +295,7 @@ def test_config_change(microvm_factory, guest_kernel, rootfs): new_sizes = [20, 10, 30] # MB mkfs_mount_cmd = "mkfs.ext4 /dev/vdb && mkdir -p /tmp/tmp && mount /dev/vdb /tmp/tmp && umount /tmp/tmp" - vm = microvm_factory.build(guest_kernel, rootfs, monitor_memory=False) + vm = uvm_plain_any vm.spawn(log_level="Info") vm.basic_config() vm.add_net_iface() From 49cfaf68c4572a3834ca2ed325a79ac5e1b2f57c Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Tue, 15 Jul 2025 16:01:32 +0100 Subject: [PATCH 71/99] test(initrd): run also with PCI enabled Run the initrd tests also with PCI enabled to verify everything is still working correctly. Signed-off-by: Riccardo Mancini Signed-off-by: Babis Chalios --- tests/integration_tests/performance/test_initrd.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/integration_tests/performance/test_initrd.py b/tests/integration_tests/performance/test_initrd.py index 0db8578a5ef..6cf133e373c 100644 --- a/tests/integration_tests/performance/test_initrd.py +++ b/tests/integration_tests/performance/test_initrd.py @@ -9,13 +9,15 @@ @pytest.fixture -def uvm_with_initrd(microvm_factory, guest_kernel, record_property, artifact_dir): +def uvm_with_initrd( + microvm_factory, guest_kernel, pci_enabled, record_property, artifact_dir +): """ See file:../docs/initrd.md """ fs = artifact_dir / "initramfs.cpio" record_property("rootfs", fs.name) - uvm = microvm_factory.build(guest_kernel) + uvm = microvm_factory.build(guest_kernel, pci=pci_enabled) uvm.initrd_file = fs yield uvm From 71fa49bdbd7807eb52e60740bf3a26e158bc551c Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Tue, 15 Jul 2025 16:02:14 +0100 Subject: [PATCH 72/99] test(memory-overhead): run also with PCI enabled Run test_memory_overhead performance test also with PCI enabled. Signed-off-by: Riccardo Mancini Signed-off-by: Babis Chalios --- .../performance/test_memory_overhead.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/integration_tests/performance/test_memory_overhead.py b/tests/integration_tests/performance/test_memory_overhead.py index 7935397cff4..2f4888c95ea 100644 --- a/tests/integration_tests/performance/test_memory_overhead.py +++ b/tests/integration_tests/performance/test_memory_overhead.py @@ -30,7 +30,13 @@ ) @pytest.mark.nonci def test_memory_overhead( - microvm_factory, guest_kernel_acpi, rootfs, vcpu_count, mem_size_mib, metrics + microvm_factory, + guest_kernel_acpi, + rootfs, + vcpu_count, + mem_size_mib, + pci_enabled, + metrics, ): """Track Firecracker memory overhead. 
@@ -38,7 +44,9 @@ def test_memory_overhead( """ for _ in range(5): - microvm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False) + microvm = microvm_factory.build( + guest_kernel_acpi, rootfs, pci=pci_enabled, monitor_memory=False + ) microvm.spawn(emit_metrics=True) microvm.basic_config(vcpu_count=vcpu_count, mem_size_mib=mem_size_mib) microvm.add_net_iface() From 27b3fad3fd5b11053ca6cb20477743c4e00a672a Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Tue, 15 Jul 2025 16:02:46 +0100 Subject: [PATCH 73/99] test(perf/snapshot): run also with PCI enabled Run the restore latency tests also with PCI enabled to verify there is no change. Signed-off-by: Riccardo Mancini Signed-off-by: Babis Chalios --- .../performance/test_snapshot.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tests/integration_tests/performance/test_snapshot.py b/tests/integration_tests/performance/test_snapshot.py index d8bd8b91c70..b4e9afabb67 100644 --- a/tests/integration_tests/performance/test_snapshot.py +++ b/tests/integration_tests/performance/test_snapshot.py @@ -44,12 +44,13 @@ def id(self): """Computes a unique id for this test instance""" return "all_dev" if self.all_devices else f"{self.vcpus}vcpu_{self.mem}mb" - def boot_vm(self, microvm_factory, guest_kernel, rootfs) -> Microvm: + def boot_vm(self, microvm_factory, guest_kernel, rootfs, pci_enabled) -> Microvm: """Creates the initial snapshot that will be loaded repeatedly to sample latencies""" vm = microvm_factory.build( guest_kernel, rootfs, monitor_memory=False, + pci=pci_enabled, ) vm.spawn(log_level="Info", emit_metrics=True) vm.time_api_requests = False @@ -96,7 +97,7 @@ def boot_vm(self, microvm_factory, guest_kernel, rootfs) -> Microvm: ids=lambda x: x.id, ) def test_restore_latency( - microvm_factory, rootfs, guest_kernel_linux_5_10, test_setup, metrics + microvm_factory, guest_kernel_linux_5_10, rootfs, pci_enabled, test_setup, metrics ): """ Restores snapshots with vcpu/memory configuration, roughly scaling according to mem = (vcpus - 1) * 2048MB, @@ -105,7 +106,9 @@ def test_restore_latency( We only test a single guest kernel, as the guest kernel does not "participate" in snapshot restore. """ - vm = test_setup.boot_vm(microvm_factory, guest_kernel_linux_5_10, rootfs) + vm = test_setup.boot_vm( + microvm_factory, guest_kernel_linux_5_10, rootfs, pci_enabled + ) metrics.set_dimensions( { @@ -147,6 +150,7 @@ def test_post_restore_latency( microvm_factory, rootfs, guest_kernel_linux_5_10, + pci_enabled, metrics, uffd_handler, huge_pages, @@ -156,7 +160,9 @@ def test_post_restore_latency( pytest.skip("huge page snapshots can only be restored using uffd") test_setup = SnapshotRestoreTest(mem=1024, vcpus=2, huge_pages=huge_pages) - vm = test_setup.boot_vm(microvm_factory, guest_kernel_linux_5_10, rootfs) + vm = test_setup.boot_vm( + microvm_factory, guest_kernel_linux_5_10, rootfs, pci_enabled + ) metrics.set_dimensions( { @@ -204,6 +210,7 @@ def test_population_latency( microvm_factory, rootfs, guest_kernel_linux_5_10, + pci_enabled, metrics, huge_pages, vcpus, @@ -211,7 +218,9 @@ def test_population_latency( ): """Collects population latency metrics (e.g. 
how long it takes UFFD handler to fault in all memory)""" test_setup = SnapshotRestoreTest(mem=mem, vcpus=vcpus, huge_pages=huge_pages) - vm = test_setup.boot_vm(microvm_factory, guest_kernel_linux_5_10, rootfs) + vm = test_setup.boot_vm( + microvm_factory, guest_kernel_linux_5_10, rootfs, pci_enabled + ) metrics.set_dimensions( { From 25bd4e3952318e726406a82592216ccc5dc6a780 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Fri, 18 Jul 2025 10:12:57 +0100 Subject: [PATCH 74/99] fix(test_block): correct fixture name s/uvm_any_acpi/uvm_plain_acpi/ Fix the block tests by using the correct fixture name. Fixes: eb7248f ("refactor(test): add uvm_plain_acpi and _6_1 fixtures") Signed-off-by: Riccardo Mancini Signed-off-by: Babis Chalios --- tests/integration_tests/performance/test_block.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/integration_tests/performance/test_block.py b/tests/integration_tests/performance/test_block.py index 7fdd9576a3a..8882ee0717c 100644 --- a/tests/integration_tests/performance/test_block.py +++ b/tests/integration_tests/performance/test_block.py @@ -161,7 +161,7 @@ def emit_fio_metrics(logs_dir, metrics): @pytest.mark.parametrize("fio_block_size", [4096], ids=["bs4096"]) @pytest.mark.parametrize("fio_engine", ["libaio", "psync"]) def test_block_performance( - uvm_any_acpi, + uvm_plain_acpi, vcpus, fio_mode, fio_block_size, @@ -173,7 +173,7 @@ def test_block_performance( """ Execute block device emulation benchmarking scenarios. """ - vm = uvm_any_acpi + vm = uvm_plain_acpi vm.spawn(log_level="Info", emit_metrics=True) vm.basic_config(vcpu_count=vcpus, mem_size_mib=GUEST_MEM_MIB) vm.add_net_iface() @@ -211,7 +211,7 @@ def test_block_performance( @pytest.mark.parametrize("fio_mode", ["randread"]) @pytest.mark.parametrize("fio_block_size", [4096], ids=["bs4096"]) def test_block_vhost_user_performance( - uvm_any_acpi, + uvm_plain_acpi, vcpus, fio_mode, fio_block_size, @@ -222,7 +222,7 @@ def test_block_vhost_user_performance( Execute block device emulation benchmarking scenarios. """ - vm = uvm_any_acpi + vm = uvm_plain_acpi vm.spawn(log_level="Info", emit_metrics=True) vm.basic_config(vcpu_count=vcpus, mem_size_mib=GUEST_MEM_MIB) vm.add_net_iface() From 0979f94ab2e40e49c10993df35f152cc82f898d6 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 29 Jul 2025 14:42:02 +0200 Subject: [PATCH 75/99] fix: check in Cargo.lock changes Probably missed during a rebase of `feature/pcie` on top of `main` branch. Signed-off-by: Babis Chalios --- Cargo.lock | 9 --------- 1 file changed, 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9fcde886d54..339ab721674 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -761,15 +761,6 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" -[[package]] -name = "itertools" -version = "0.10.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" -dependencies = [ - "either", -] - [[package]] name = "itertools" version = "0.12.1" From 53dec53b090b0ac7580ed269fb4880ef830e716d Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 28 Jul 2025 13:40:39 +0200 Subject: [PATCH 76/99] msix: relax assertion on data accesses from guest It is true that writes/reads of an MSI-X table are either 32 or 64 bits long. However, we do check for this invariant in the `match` expression just after the assertion. 
If the invariant is not held (the guest tried to read/write with an invalid length), we just print an error and continue. This branch of the `match` block is never reached due to the assertion itself. To simplify things, just remove the assertion and let the `match` block logic handle invalid memory accesses. This should also help us better fuzz the bus accesses. Do add a check that the data access is up to 8 bytes long. These are all MMIO or Port IO accesses and they can't be bigger than 8 bytes. So this assertion should never fail in production (unless there's a KVM bug or we try to run Firecracker on some architecture that allows memory accesses wider than 64 bits). Signed-off-by: Babis Chalios --- src/pci/src/msix.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pci/src/msix.rs b/src/pci/src/msix.rs index be5aa3b8cf1..82f851322b4 100644 --- a/src/pci/src/msix.rs +++ b/src/pci/src/msix.rs @@ -219,7 +219,7 @@ impl MsixConfig { } pub fn read_table(&self, offset: u64, data: &mut [u8]) { - assert!((data.len() == 4 || data.len() == 8)); + assert!(data.len() <= 8); let index: usize = (offset / MSIX_TABLE_ENTRIES_MODULO) as usize; let modulo_offset = offset % MSIX_TABLE_ENTRIES_MODULO; @@ -272,7 +272,7 @@ } pub fn write_table(&mut self, offset: u64, data: &[u8]) { - assert!((data.len() == 4 || data.len() == 8)); + assert!(data.len() <= 8); let index: usize = (offset / MSIX_TABLE_ENTRIES_MODULO) as usize; let modulo_offset = offset % MSIX_TABLE_ENTRIES_MODULO; @@ -368,7 +368,7 @@ } pub fn read_pba(&mut self, offset: u64, data: &mut [u8]) { - assert!((data.len() == 4 || data.len() == 8)); + assert!(data.len() <= 8); let index: usize = (offset / MSIX_PBA_ENTRIES_MODULO) as usize; let modulo_offset = offset % MSIX_PBA_ENTRIES_MODULO; From d54bdb618a5bfb77f7117d0c558c8e276eacffcc Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 28 Jul 2025 17:35:47 +0200 Subject: [PATCH 77/99] msi: use Vec to store GSIs for MSI vectors We were using a HashMap to store the GSIs that were used by the vectors of an MSI-X group. These vectors were always indexed starting from 0, so we can just use a simple Vec.
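To illustrate the data structure change with a standalone sketch (simplified stand-in types, not the actual Firecracker `MsiVectorGroup`): because MSI-X vectors are addressed by a dense index starting at 0, a `Vec` indexed by the vector number gives the same lookups as a `HashMap` keyed by it, with no hashing and no possibility of holes in the index space.

    // Standalone sketch, simplified types: vector i of the group always sits at
    // irq_routes[i], so a Vec replaces a HashMap keyed by the vector index.
    #[derive(Debug)]
    struct MsiVector {
        gsi: u32,
    }

    #[derive(Debug)]
    struct MsiVectorGroup {
        irq_routes: Vec<MsiVector>,
    }

    impl MsiVectorGroup {
        // Hypothetical constructor: assumes a contiguous block of GSIs.
        fn new(first_gsi: u32, count: u16) -> Self {
            let irq_routes = (0..u32::from(count))
                .map(|i| MsiVector { gsi: first_gsi + i })
                .collect();
            MsiVectorGroup { irq_routes }
        }

        // Lookups by vector index become plain slice accesses.
        fn gsi(&self, index: u16) -> Option<u32> {
            self.irq_routes.get(usize::from(index)).map(|v| v.gsi)
        }
    }

    fn main() {
        let group = MsiVectorGroup::new(32, 4);
        assert_eq!(group.gsi(0), Some(32));
        assert_eq!(group.gsi(3), Some(35));
        assert_eq!(group.gsi(4), None); // out-of-range index, no panic
    }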
Signed-off-by: Babis Chalios --- .../devices/virtio/transport/pci/device.rs | 2 +- src/vmm/src/vstate/vm.rs | 44 +++++++++---------- 2 files changed, 22 insertions(+), 24 deletions(-) diff --git a/src/vmm/src/devices/virtio/transport/pci/device.rs b/src/vmm/src/devices/virtio/transport/pci/device.rs index 384ad0358dd..7ee580fc6a1 100644 --- a/src/vmm/src/devices/virtio/transport/pci/device.rs +++ b/src/vmm/src/devices/virtio/transport/pci/device.rs @@ -305,7 +305,7 @@ pub struct VirtioPciDeviceState { pub pci_configuration_state: PciConfigurationState, pub pci_dev_state: VirtioPciCommonConfigState, pub msix_state: MsixConfigState, - pub msi_vector_group: HashMap, + pub msi_vector_group: Vec, pub bar_configuration: Vec, } diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index f4a18484cbc..8d037db96fa 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -110,7 +110,7 @@ impl MsiVector { /// MSI interrupts created for a VirtIO device pub struct MsiVectorGroup { vm: Arc, - irq_routes: HashMap, + irq_routes: Vec, } impl MsiVectorGroup { @@ -123,7 +123,7 @@ impl MsiVectorGroup { } impl<'a> Persist<'a> for MsiVectorGroup { - type State = HashMap; + type State = Vec; type ConstructorArgs = Arc; type Error = InterruptError; @@ -131,20 +131,17 @@ impl<'a> Persist<'a> for MsiVectorGroup { // We don't save the "enabled" state of the MSI interrupt. PCI devices store the MSI-X // configuration and make sure that the vector is enabled during the restore path if it was // initially enabled - self.irq_routes - .iter() - .map(|(id, route)| (*id, route.gsi)) - .collect() + self.irq_routes.iter().map(|route| route.gsi).collect() } fn restore( constructor_args: Self::ConstructorArgs, state: &Self::State, ) -> std::result::Result { - let mut irq_routes = HashMap::new(); + let mut irq_routes = Vec::with_capacity(state.len()); - for (id, gsi) in state { - irq_routes.insert(*id, MsiVector::new(*gsi, false)?); + for gsi in state { + irq_routes.push(MsiVector::new(*gsi, false)?); } Ok(MsiVectorGroup { @@ -156,7 +153,7 @@ impl<'a> Persist<'a> for MsiVectorGroup { impl InterruptSourceGroup for MsiVectorGroup { fn enable(&self) -> vm_device::interrupt::Result<()> { - for route in self.irq_routes.values() { + for route in &self.irq_routes { route.enable(&self.vm.common.fd)?; } @@ -164,7 +161,7 @@ impl InterruptSourceGroup for MsiVectorGroup { } fn disable(&self) -> vm_device::interrupt::Result<()> { - for route in self.irq_routes.values() { + for route in &self.irq_routes { route.disable(&self.vm.common.fd)?; } @@ -180,7 +177,9 @@ impl InterruptSourceGroup for MsiVectorGroup { } fn notifier(&self, index: InterruptIndex) -> Option<&EventFd> { - self.irq_routes.get(&index).map(|route| &route.event_fd) + self.irq_routes + .get(index as usize) + .map(|route| &route.event_fd) } fn update( @@ -199,7 +198,7 @@ impl InterruptSourceGroup for MsiVectorGroup { InterruptSourceConfig::MsiIrq(config) => config, }; - if let Some(route) = self.irq_routes.get(&index) { + if let Some(route) = self.irq_routes.get(index as usize) { // When an interrupt is masked the GSI will not be passed to KVM through // KVM_SET_GSI_ROUTING. 
So, call [`disable()`] to unregister the interrupt file // descriptor before passing the interrupt routes to KVM @@ -593,14 +592,13 @@ impl Vm { /// Create a group of MSI-X interrupts pub fn create_msix_group(vm: Arc, count: u16) -> Result { debug!("Creating new MSI group with {count} vectors"); - let mut irq_routes = HashMap::with_capacity(count as usize); - for (gsi, i) in vm + let mut irq_routes = Vec::with_capacity(count as usize); + for gsi in vm .resource_allocator() .allocate_gsi_msi(count as u32)? .iter() - .zip(0u32..) { - irq_routes.insert(i, MsiVector::new(*gsi, false)?); + irq_routes.push(MsiVector::new(*gsi, false)?); } Ok(MsiVectorGroup { vm, irq_routes }) @@ -821,13 +819,13 @@ pub(crate) mod tests { let msix_group = create_msix_group(&vm); // Initially all vectors are disabled - for route in msix_group.irq_routes.values() { + for route in &msix_group.irq_routes { assert!(!route.enabled.load(Ordering::Acquire)) } // Enable works msix_group.enable().unwrap(); - for route in msix_group.irq_routes.values() { + for route in &msix_group.irq_routes { assert!(route.enabled.load(Ordering::Acquire)); } // Enabling an enabled group doesn't error out @@ -835,7 +833,7 @@ pub(crate) mod tests { // Disable works msix_group.disable().unwrap(); - for route in msix_group.irq_routes.values() { + for route in &msix_group.irq_routes { assert!(!route.enabled.load(Ordering::Acquire)) } // Disabling a disabled group doesn't error out @@ -921,7 +919,7 @@ pub(crate) mod tests { } // All vectors should be disabled - for vector in msix_group.irq_routes.values() { + for vector in &msix_group.irq_routes { assert!(!vector.enabled.load(Ordering::Acquire)); } @@ -1018,8 +1016,8 @@ pub(crate) mod tests { // Even if an MSI group is enabled, we don't save it as such. During restoration, the PCI // transport will make sure the correct config is set for the vectors and enable them // accordingly. - for (id, vector) in msix_group.irq_routes { - let new_vector = restored_group.irq_routes.get(&id).unwrap(); + for (id, vector) in msix_group.irq_routes.iter().enumerate() { + let new_vector = &restored_group.irq_routes[id]; assert_eq!(vector.gsi, new_vector.gsi); assert!(!new_vector.enabled.load(Ordering::Acquire)); } From 3f86ebc5ff5fb1e21b78a384cdbf8428cd1aa2b2 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 29 Jul 2025 11:25:16 +0200 Subject: [PATCH 78/99] msi: fix size of interrupts HashMap We are using a HashMap to track the interrupt routes we use in the system. The index to the HashMap is the GSI of the interrupt route. We know the maximum number of GSIs we have available so pre-allocate the space for the HashMap to avoid reallocations at runtime. 
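As a rough sketch of the effect (with an illustrative bound of 256 routes; the real bound comes from the architecture's `GSI_MSI_END` constant), reserving the full capacity up front means filling the map never triggers a rehash or reallocation.

    use std::collections::HashMap;

    // Illustrative bound only; Firecracker derives the real one from GSI_MSI_END.
    const MAX_GSI_ROUTES: usize = 256;

    fn main() {
        // Keyed by GSI, whose range is known in advance, so reserve it all up front.
        let mut interrupts: HashMap<u32, String> = HashMap::with_capacity(MAX_GSI_ROUTES);
        let initial_capacity = interrupts.capacity();

        for gsi in 0..MAX_GSI_ROUTES as u32 {
            interrupts.insert(gsi, format!("route for GSI {gsi}"));
        }

        // Inserting up to the reserved bound never resized the table.
        assert_eq!(interrupts.capacity(), initial_capacity);
        assert_eq!(interrupts.len(), MAX_GSI_ROUTES);
    }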
Signed-off-by: Babis Chalios --- src/vmm/src/vstate/vm.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 8d037db96fa..aeb61f88c56 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -28,8 +28,8 @@ use vm_device::interrupt::{ use vmm_sys_util::errno; use vmm_sys_util::eventfd::EventFd; -use crate::arch::host_page_size; pub use crate::arch::{ArchVm as Vm, ArchVmError, VmState}; +use crate::arch::{GSI_MSI_END, host_page_size}; use crate::logger::info; use crate::persist::CreateSnapshotError; use crate::snapshot::Persist; @@ -322,7 +322,7 @@ impl Vm { fd, max_memslots: kvm.max_nr_memslots(), guest_memory: GuestMemoryMmap::default(), - interrupts: Mutex::new(HashMap::new()), + interrupts: Mutex::new(HashMap::with_capacity(GSI_MSI_END as usize + 1)), resource_allocator: Mutex::new(ResourceAllocator::new()), mmio_bus: Arc::new(vm_device::Bus::new()), }) From 2e44dee045c7befecdaab076a9b47c4e5ec2103c Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 29 Jul 2025 14:29:11 +0200 Subject: [PATCH 79/99] fix: return NO_VECTOR when reading MSI vector for invalid queue Fix a bug in the common VirtIO configuration for PCI transport where we would use `queue_select` to read the queue's MSI vector without validating it matches a valid queue. This could lead to panics when accessing the `msix_queues` array. The spec states that in such cases we should return `NO_VECTOR` (0xffff), so do that. Signed-off-by: Babis Chalios --- .../virtio/transport/pci/common_config.rs | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/vmm/src/devices/virtio/transport/pci/common_config.rs b/src/vmm/src/devices/virtio/transport/pci/common_config.rs index 6e52a1ca007..ae66d54f927 100644 --- a/src/vmm/src/devices/virtio/transport/pci/common_config.rs +++ b/src/vmm/src/devices/virtio/transport/pci/common_config.rs @@ -230,7 +230,19 @@ impl VirtioPciCommonConfig { 0x12 => queues.len().try_into().unwrap(), // num_queues 0x16 => self.queue_select, 0x18 => self.with_queue(queues, |q| q.size).unwrap_or(0), - 0x1a => self.msix_queues.lock().unwrap()[self.queue_select as usize], + // If `queue_select` points to an invalid queue we should return NO_VECTOR. + // Reading from here + // https://docs.oasis-open.org/virtio/virtio/v1.1/csprd01/virtio-v1.1-csprd01.html#x1-1280005: + // + // > The device MUST return vector mapped to a given event, (NO_VECTOR if unmapped) on + // > read of config_msix_vector/queue_msix_vector. + 0x1a => self + .msix_queues + .lock() + .unwrap() + .get(self.queue_select as usize) + .copied() + .unwrap_or(0xffff), 0x1c => u16::from(self.with_queue(queues, |q| q.ready).unwrap_or(false)), 0x1e => self.queue_select, // notify_off _ => { @@ -408,8 +420,13 @@ mod tests { // 'queue_select' can be read and written.
regs.write(0x16, &[0xaa, 0x55], dev.clone()); let mut read_back = vec![0x00, 0x00]; - regs.read(0x16, &mut read_back, dev); + regs.read(0x16, &mut read_back, dev.clone()); assert_eq!(read_back[0], 0xaa); assert_eq!(read_back[1], 0x55); + + // Getting the MSI vector when `queue_select` points to an invalid queue should return + // NO_VECTOR (0xffff) + regs.read(0x1a, &mut read_back, dev); + assert_eq!(read_back, [0xff, 0xff]); } } From 30bba8846c24fa7ff0f99b87faee9f1936a16d31 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 29 Jul 2025 15:19:03 +0200 Subject: [PATCH 80/99] fix(pci): correct shift size when setting config address We were shifting by the wrong number of bits for 2-byte accesses. Signed-off-by: Babis Chalios --- src/pci/src/bus.rs | 120 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 118 insertions(+), 2 deletions(-) diff --git a/src/pci/src/bus.rs b/src/pci/src/bus.rs index 775238edff9..adfac6c12fb 100644 --- a/src/pci/src/bus.rs +++ b/src/pci/src/bus.rs @@ -291,8 +291,8 @@ impl PciConfigIo { u32::from(data[0]) << (offset * 8), ), 2 => ( - 0x0000_ffff << (offset * 16), - ((u32::from(data[1]) << 8) | u32::from(data[0])) << (offset * 16), + 0x0000_ffff << (offset * 8), + ((u32::from(data[1]) << 8) | u32::from(data[0])) << (offset * 8), ), 4 => (0xffff_ffff, LittleEndian::read_u32(data)), _ => return, @@ -475,3 +475,119 @@ fn parse_io_config_address(config_address: u32) -> (usize, usize, usize, usize) shift_and_mask(config_address, REGISTER_NUMBER_OFFSET, REGISTER_NUMBER_MASK), ) } + +#[cfg(test)] +mod tests { + use std::sync::{Arc, Mutex}; + + use vm_device::BusDevice; + + use super::{PciBus, PciConfigIo, PciRoot}; + use crate::DeviceRelocation; + + struct RelocationMock; + + impl DeviceRelocation for RelocationMock { + fn move_bar( + &self, + _old_base: u64, + _new_base: u64, + _len: u64, + _pci_dev: &mut dyn crate::PciDevice, + _region_type: crate::PciBarRegionType, + ) -> std::result::Result<(), std::io::Error> { + Ok(()) + } + } + + #[test] + fn test_writing_config_address() { + let mock = Arc::new(RelocationMock); + let root = PciRoot::new(None); + let mut bus = PciConfigIo::new(Arc::new(Mutex::new(PciBus::new(root, mock)))); + + assert_eq!(bus.config_address, 0); + // Writing more than 32 bits will should fail + bus.write(0, 0, &[0x42; 8]); + assert_eq!(bus.config_address, 0); + // Write all the address at once + bus.write(0, 0, &[0x13, 0x12, 0x11, 0x10]); + assert_eq!(bus.config_address, 0x10111213); + // Not writing 32bits at offset 0 should have no effect + bus.write(0, 1, &[0x0; 4]); + assert_eq!(bus.config_address, 0x10111213); + + // Write two bytes at a time + bus.write(0, 0, &[0x42, 0x42]); + assert_eq!(bus.config_address, 0x10114242); + bus.write(0, 1, &[0x43, 0x43]); + assert_eq!(bus.config_address, 0x10434342); + bus.write(0, 2, &[0x44, 0x44]); + assert_eq!(bus.config_address, 0x44444342); + // Writing two bytes at offset 3 should overflow, so it shouldn't have any effect + bus.write(0, 3, &[0x45, 0x45]); + assert_eq!(bus.config_address, 0x44444342); + + // Write one byte at a time + bus.write(0, 0, &[0x0]); + assert_eq!(bus.config_address, 0x44444300); + bus.write(0, 1, &[0x0]); + assert_eq!(bus.config_address, 0x44440000); + bus.write(0, 2, &[0x0]); + assert_eq!(bus.config_address, 0x44000000); + bus.write(0, 3, &[0x0]); + assert_eq!(bus.config_address, 0x00000000); + // Writing past 4 bytes should have no effect + bus.write(0, 4, &[0x13]); + assert_eq!(bus.config_address, 0x0); + } + + #[test] + fn test_reading_config_address() { + let 
mock = Arc::new(RelocationMock); + let root = PciRoot::new(None); + let mut bus = PciConfigIo::new(Arc::new(Mutex::new(PciBus::new(root, mock)))); + + let mut buffer = [0u8; 4]; + + bus.config_address = 0x13121110; + + // First 4 bytes are the config address + // Next 4 bytes are the values read from the configuration space. + // + // Reading past offset 7 should not return nothing (all 1s) + bus.read(0, 8, &mut buffer); + assert_eq!(buffer, [0xff; 4]); + + // offset + buffer.len() needs to be smaller or equal than 4 + bus.read(0, 1, &mut buffer); + assert_eq!(buffer, [0xff; 4]); + bus.read(0, 2, &mut buffer[..3]); + assert_eq!(buffer, [0xff; 4]); + bus.read(0, 3, &mut buffer[..2]); + assert_eq!(buffer, [0xff; 4]); + + // reading one byte at a time + bus.read(0, 0, &mut buffer[0..1]); + assert_eq!(buffer, [0x10, 0xff, 0xff, 0xff]); + bus.read(0, 1, &mut buffer[1..2]); + assert_eq!(buffer, [0x10, 0x11, 0xff, 0xff]); + bus.read(0, 2, &mut buffer[2..3]); + assert_eq!(buffer, [0x10, 0x11, 0x12, 0xff]); + bus.read(0, 3, &mut buffer[3..4]); + assert_eq!(buffer, [0x10, 0x11, 0x12, 0x13]); + + // reading two bytes at a time + bus.config_address = 0x42434445; + bus.read(0, 0, &mut buffer[..2]); + assert_eq!(buffer, [0x45, 0x44, 0x12, 0x13]); + bus.read(0, 1, &mut buffer[..2]); + assert_eq!(buffer, [0x44, 0x43, 0x12, 0x13]); + bus.read(0, 2, &mut buffer[..2]); + assert_eq!(buffer, [0x43, 0x42, 0x12, 0x13]); + + // reading all of it at once + bus.read(0, 0, &mut buffer); + assert_eq!(buffer, [0x45, 0x44, 0x43, 0x42]); + } +} From d76a8ecfce69866648928b5457c0404f8331c12a Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 30 Jul 2025 10:29:10 +0200 Subject: [PATCH 81/99] fix: only set MSI-X vector for valid queues This is the equivalent to the fix in 741c29f02c491 but for guest writes. We need to make sure that `queue_select` points to a valid queue before setting the MSI-X vector otherwise we'll hit a panic when accessing the underlying `Vec`. Signed-off-by: Babis Chalios --- .../virtio/transport/pci/common_config.rs | 25 +++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/src/vmm/src/devices/virtio/transport/pci/common_config.rs b/src/vmm/src/devices/virtio/transport/pci/common_config.rs index ae66d54f927..00b61e67b67 100644 --- a/src/vmm/src/devices/virtio/transport/pci/common_config.rs +++ b/src/vmm/src/devices/virtio/transport/pci/common_config.rs @@ -258,7 +258,18 @@ impl VirtioPciCommonConfig { 0x10 => self.msix_config.store(value, Ordering::Release), 0x16 => self.queue_select = value, 0x18 => self.with_queue_mut(queues, |q| q.size = value), - 0x1a => self.msix_queues.lock().unwrap()[self.queue_select as usize] = value, + 0x1a => { + // Make sure that `queue_select` points to a valid queue. If not, we won't do + // anything here and subsequent reads at 0x1a will return `NO_VECTOR`. + if let Some(msix_queue) = self + .msix_queues + .lock() + .unwrap() + .get_mut(self.queue_select as usize) + { + *msix_queue = value; + } + } 0x1c => self.with_queue_mut(queues, |q| { q.ready = value == 1; }), @@ -426,7 +437,17 @@ mod tests { // Getting the MSI vector when `queue_select` points to an invalid queue should return // NO_VECTOR (0xffff) - regs.read(0x1a, &mut read_back, dev); + regs.read(0x1a, &mut read_back, dev.clone()); assert_eq!(read_back, [0xff, 0xff]); + + // Writing the MSI vector of an invalid `queue_select` does not have any effect. 
+ regs.write(0x1a, &[0x12, 0x13], dev.clone()); + assert_eq!(read_back, [0xff, 0xff]); + // Valid `queue_select` though should setup the corresponding MSI-X queue. + regs.write(0x16, &[0x1, 0x0], dev.clone()); + assert_eq!(regs.queue_select, 1); + regs.write(0x1a, &[0x12, 0x13], dev.clone()); + regs.read(0x1a, &mut read_back, dev); + assert_eq!(LittleEndian::read_u16(&read_back[..2]), 0x1312); } } From 28cea21a46a71e07782699f5765610ed40ed4aaa Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Wed, 30 Jul 2025 09:49:25 +0100 Subject: [PATCH 82/99] feat: add swiotlb=noforce to default kernel command line swiotlb=noforce disables SWIOTLB, which is enabled by the kernel if the physical addresses exceed 32b. This is not needed for us and causes 64MB to be wasted on the microvm. Signed-off-by: Riccardo Mancini Signed-off-by: Babis Chalios --- src/vmm/src/vmm_config/boot_source.rs | 5 +++-- tests/framework/microvm.py | 2 +- tests/framework/microvm_helpers.py | 2 +- tests/framework/vm_config_missing_vcpu_count.json | 2 +- tests/integration_tests/functional/test_error_code.py | 2 +- .../functional/test_kernel_cmdline.py | 2 +- tests/integration_tests/functional/test_serial_io.py | 10 ++++++---- tests/integration_tests/performance/test_boottime.py | 2 +- tests/integration_tests/performance/test_initrd.py | 2 +- 9 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/vmm/src/vmm_config/boot_source.rs b/src/vmm/src/vmm_config/boot_source.rs index 297f8abff04..dc21523af3c 100644 --- a/src/vmm/src/vmm_config/boot_source.rs +++ b/src/vmm/src/vmm_config/boot_source.rs @@ -14,8 +14,9 @@ use serde::{Deserialize, Serialize}; /// - `i8042.noaux` do not probe the i8042 controller for an attached mouse (save boot time); /// - `i8042.nomux` do not probe i8042 for a multiplexing controller (save boot time); /// - `i8042.dumbkbd` do not attempt to control kbd state via the i8042 (save boot time). -pub const DEFAULT_KERNEL_CMDLINE: &str = - "reboot=k panic=1 nomodule 8250.nr_uarts=0 i8042.noaux i8042.nomux i8042.dumbkbd"; +/// - `swiotlb=noforce` disable software bounce buffers (SWIOTLB) +pub const DEFAULT_KERNEL_CMDLINE: &str = "reboot=k panic=1 nomodule 8250.nr_uarts=0 i8042.noaux \ + i8042.nomux i8042.dumbkbd swiotlb=noforce"; /// Strongly typed data structure used to configure the boot source of the /// microvm. 
diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index 45850cce211..3c672e82e23 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -807,7 +807,7 @@ def basic_config( If boot_args is None, the default boot_args in Firecracker is reboot=k panic=1 nomodule 8250.nr_uarts=0 i8042.noaux i8042.nomux - i8042.nopnp i8042.dumbkbd + i8042.nopnp i8042.dumbkbd swiotlb=noforce if PCI is disabled, Firecracker also passes to the guest pci=off diff --git a/tests/framework/microvm_helpers.py b/tests/framework/microvm_helpers.py index b34da3c447e..f42b63222fb 100644 --- a/tests/framework/microvm_helpers.py +++ b/tests/framework/microvm_helpers.py @@ -127,7 +127,7 @@ def enable_console(self): raise RuntimeError(".spawn already called, too late to enable the console") if self.vm.boot_args is None: self.vm.boot_args = "" - self.vm.boot_args += "console=ttyS0 reboot=k panic=1" + self.vm.boot_args += "console=ttyS0 reboot=k panic=1 swiotlb=noforce" self.vm.jailer.daemonize = False self.vm.jailer.new_pid_ns = False diff --git a/tests/framework/vm_config_missing_vcpu_count.json b/tests/framework/vm_config_missing_vcpu_count.json index 39bb6a38954..719300c96fa 100644 --- a/tests/framework/vm_config_missing_vcpu_count.json +++ b/tests/framework/vm_config_missing_vcpu_count.json @@ -1,7 +1,7 @@ { "boot-source": { "kernel_image_path": "vmlinux.bin", - "boot_args": "console=ttyS0 reboot=k panic=1" + "boot_args": "console=ttyS0 reboot=k panic=1 swiotlb=noforce" }, "drives": [ { diff --git a/tests/integration_tests/functional/test_error_code.py b/tests/integration_tests/functional/test_error_code.py index 171d3853460..321c251cc93 100644 --- a/tests/integration_tests/functional/test_error_code.py +++ b/tests/integration_tests/functional/test_error_code.py @@ -25,7 +25,7 @@ def test_enosys_error_code(uvm_plain): vm.memory_monitor = None vm.basic_config( vcpu_count=1, - boot_args="reboot=k panic=1 init=/usr/local/bin/devmemread", + boot_args="reboot=k panic=1 swiotlb=noforce init=/usr/local/bin/devmemread", ) vm.start() diff --git a/tests/integration_tests/functional/test_kernel_cmdline.py b/tests/integration_tests/functional/test_kernel_cmdline.py index 7ba345f2111..e4e4c122aa9 100644 --- a/tests/integration_tests/functional/test_kernel_cmdline.py +++ b/tests/integration_tests/functional/test_kernel_cmdline.py @@ -21,7 +21,7 @@ def test_init_params(uvm_plain): # Ubuntu version from the /etc/issue file. vm.basic_config( vcpu_count=1, - boot_args="console=ttyS0 reboot=k panic=1 init=/bin/cat -- /etc/issue", + boot_args="console=ttyS0 reboot=k panic=1 swiotlb=noforce init=/bin/cat -- /etc/issue", ) vm.start() diff --git a/tests/integration_tests/functional/test_serial_io.py b/tests/integration_tests/functional/test_serial_io.py index 9005d0896b3..353496576e4 100644 --- a/tests/integration_tests/functional/test_serial_io.py +++ b/tests/integration_tests/functional/test_serial_io.py @@ -55,7 +55,7 @@ def test_serial_after_snapshot(uvm_plain, microvm_factory): microvm.basic_config( vcpu_count=2, mem_size_mib=256, - boot_args="console=ttyS0 reboot=k panic=1", + boot_args="console=ttyS0 reboot=k panic=1 swiotlb=noforce", ) serial = Serial(microvm) serial.open() @@ -99,7 +99,9 @@ def test_serial_console_login(uvm_plain_any): microvm.memory_monitor = None # Set up the microVM with 1 vCPU and a serial console. 
- microvm.basic_config(vcpu_count=1, boot_args="console=ttyS0 reboot=k panic=1") + microvm.basic_config( + vcpu_count=1, boot_args="console=ttyS0 reboot=k panic=1 swiotlb=noforce" + ) microvm.start() @@ -144,7 +146,7 @@ def test_serial_dos(uvm_plain_any): # Set up the microVM with 1 vCPU and a serial console. microvm.basic_config( vcpu_count=1, - boot_args="console=ttyS0 reboot=k panic=1", + boot_args="console=ttyS0 reboot=k panic=1 swiotlb=noforce", ) microvm.add_net_iface() microvm.start() @@ -178,7 +180,7 @@ def test_serial_block(uvm_plain_any): test_microvm.basic_config( vcpu_count=1, mem_size_mib=512, - boot_args="console=ttyS0 reboot=k panic=1", + boot_args="console=ttyS0 reboot=k panic=1 swiotlb=noforce", ) test_microvm.add_net_iface() test_microvm.start() diff --git a/tests/integration_tests/performance/test_boottime.py b/tests/integration_tests/performance/test_boottime.py index 0e533a43d08..d80bf026a39 100644 --- a/tests/integration_tests/performance/test_boottime.py +++ b/tests/integration_tests/performance/test_boottime.py @@ -12,7 +12,7 @@ DEFAULT_BOOT_ARGS = ( "reboot=k panic=1 nomodule 8250.nr_uarts=0" - " i8042.noaux i8042.nomux i8042.nopnp i8042.dumbkbd" + " i8042.noaux i8042.nomux i8042.nopnp i8042.dumbkbd swiotlb=noforce" ) diff --git a/tests/integration_tests/performance/test_initrd.py b/tests/integration_tests/performance/test_initrd.py index 6cf133e373c..7b92644efa6 100644 --- a/tests/integration_tests/performance/test_initrd.py +++ b/tests/integration_tests/performance/test_initrd.py @@ -35,7 +35,7 @@ def test_microvm_initrd_with_serial(uvm_with_initrd, huge_pages): vm.basic_config( add_root_device=False, vcpu_count=1, - boot_args="console=ttyS0 reboot=k panic=1", + boot_args="console=ttyS0 reboot=k panic=1 swiotlb=noforce", use_initrd=True, huge_pages=huge_pages, ) From cc5cab3b31442fd06c9d911b40a7c89f81288da0 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 1 Aug 2025 11:50:44 +0200 Subject: [PATCH 83/99] pci: remove unused code We are vending PCI code taken from Cloud Hypervisor's implementation. There is code in there that we don't actually use. So drop it to reduce the dead code in the project. 
Signed-off-by: Babis Chalios --- src/pci/src/bus.rs | 58 +------- src/pci/src/configuration.rs | 132 +----------------- src/vmm/src/device_manager/pci_mngr.rs | 18 +-- .../devices/virtio/transport/pci/device.rs | 20 +-- 4 files changed, 24 insertions(+), 204 deletions(-) diff --git a/src/pci/src/bus.rs b/src/pci/src/bus.rs index adfac6c12fb..eaa2923e8db 100644 --- a/src/pci/src/bus.rs +++ b/src/pci/src/bus.rs @@ -11,13 +11,10 @@ use std::ops::DerefMut; use std::sync::{Arc, Barrier, Mutex}; use byteorder::{ByteOrder, LittleEndian}; -use vm_device::{Bus, BusDevice, BusDeviceSync}; +use vm_device::BusDevice; -use crate::configuration::{ - PciBarRegionType, PciBridgeSubclass, PciClassCode, PciConfiguration, PciHeaderType, -}; +use crate::configuration::{PciBridgeSubclass, PciClassCode, PciConfiguration, PciHeaderType}; use crate::device::{DeviceRelocation, Error as PciDeviceError, PciDevice}; -use crate::PciBarConfiguration; const VENDOR_ID_INTEL: u16 = 0x8086; const DEVICE_ID_INTEL_VIRT_PCIE_HOST: u16 = 0x0d57; @@ -123,40 +120,11 @@ impl PciBus { } } - pub fn register_mapping( - &self, - dev: Arc, - io_bus: &Bus, - mmio_bus: &Bus, - bars: Vec, - ) -> Result<()> { - for bar in bars { - match bar.region_type() { - PciBarRegionType::IoRegion => { - io_bus - .insert(dev.clone(), bar.addr(), bar.size()) - .map_err(PciRootError::PioInsert)?; - } - PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => { - mmio_bus - .insert(dev.clone(), bar.addr(), bar.size()) - .map_err(PciRootError::MmioInsert)?; - } - } - } - Ok(()) - } - pub fn add_device(&mut self, device_id: u32, device: Arc>) -> Result<()> { self.devices.insert(device_id, device); Ok(()) } - pub fn remove_by_device(&mut self, device: &Arc>) -> Result<()> { - self.devices.retain(|_, dev| !Arc::ptr_eq(dev, device)); - Ok(()) - } - pub fn next_device_id(&mut self) -> Result { for (idx, device_id) in self.device_ids.iter_mut().enumerate() { if !(*device_id) { @@ -167,28 +135,6 @@ impl PciBus { Err(PciRootError::NoPciDeviceSlotAvailable) } - - pub fn get_device_id(&mut self, id: usize) -> Result<()> { - if id < NUM_DEVICE_IDS { - if !self.device_ids[id] { - self.device_ids[id] = true; - Ok(()) - } else { - Err(PciRootError::AlreadyInUsePciDeviceSlot(id)) - } - } else { - Err(PciRootError::InvalidPciDeviceSlot(id)) - } - } - - pub fn put_device_id(&mut self, id: usize) -> Result<()> { - if id < NUM_DEVICE_IDS { - self.device_ids[id] = false; - Ok(()) - } else { - Err(PciRootError::InvalidPciDeviceSlot(id)) - } - } } pub struct PciConfigIo { diff --git a/src/pci/src/configuration.rs b/src/pci/src/configuration.rs index c37f8026fbe..531e155a2f8 100644 --- a/src/pci/src/configuration.rs +++ b/src/pci/src/configuration.rs @@ -13,7 +13,7 @@ use serde::{Deserialize, Serialize}; use vm_device::PciBarType; use crate::device::BarReprogrammingParams; -use crate::{MsixConfig, PciInterruptPin}; +use crate::MsixConfig; // The number of 32bit registers in the config space, 4096 bytes. 
const NUM_CONFIGURATION_REGISTERS: usize = 1024; @@ -22,7 +22,6 @@ const STATUS_REG: usize = 1; const STATUS_REG_CAPABILITIES_USED_MASK: u32 = 0x0010_0000; const BAR0_REG: usize = 4; const ROM_BAR_REG: usize = 12; -const ROM_BAR_IDX: usize = 6; const BAR_IO_ADDR_MASK: u32 = 0xffff_fffc; const BAR_MEM_ADDR_MASK: u32 = 0xffff_fff0; const ROM_BAR_ADDR_MASK: u32 = 0xffff_f800; @@ -33,8 +32,6 @@ const CAPABILITY_LIST_HEAD_OFFSET: usize = 0x34; const FIRST_CAPABILITY_OFFSET: usize = 0x40; const CAPABILITY_MAX_OFFSET: usize = 192; -const INTERRUPT_LINE_PIN_REG: usize = 15; - pub const PCI_CONFIGURATION_ID: &str = "pci_configuration"; /// Represents the types of PCI headers allowed in the configuration registers. @@ -483,11 +480,11 @@ impl From for bool { #[derive(Debug, Copy, Clone, Serialize, Deserialize)] pub struct PciBarConfiguration { - addr: u64, - size: u64, - idx: usize, - region_type: PciBarRegionType, - prefetchable: PciBarPrefetchable, + pub addr: u64, + pub size: u64, + pub idx: usize, + pub region_type: PciBarRegionType, + pub prefetchable: PciBarPrefetchable, } #[derive(Debug)] @@ -797,42 +794,6 @@ impl PciConfiguration { Ok(()) } - /// Adds rom expansion BAR. - pub fn add_pci_rom_bar(&mut self, config: &PciBarConfiguration, active: u32) -> Result<()> { - let bar_idx = config.idx; - let reg_idx = ROM_BAR_REG; - - if self.rom_bar_used { - return Err(Error::RomBarInUse(bar_idx)); - } - - if !config.size.is_power_of_two() { - return Err(Error::RomBarSizeInvalid(config.size)); - } - - if bar_idx != ROM_BAR_IDX { - return Err(Error::RomBarInvalid(bar_idx)); - } - - let end_addr = config - .addr - .checked_add(config.size - 1) - .ok_or(Error::RomBarAddressInvalid(config.addr, config.size))?; - - if end_addr > u64::from(u32::MAX) { - return Err(Error::RomBarAddressInvalid(config.addr, config.size)); - } - - self.registers[reg_idx] = (config.addr as u32) | active; - self.writable_bits[reg_idx] = ROM_BAR_ADDR_MASK; - self.rom_bar_addr = self.registers[reg_idx]; - self.rom_bar_size = - encode_32_bits_bar_size(config.size as u32).ok_or(Error::Encode32BarSize)?; - self.rom_bar_used = true; - - Ok(()) - } - /// Returns the address of the given BAR region. pub fn get_bar_addr(&self, bar_num: usize) -> u64 { let bar_idx = BAR0_REG + bar_num; @@ -848,16 +809,6 @@ impl PciConfiguration { addr } - /// Configures the IRQ line and pin used by this device. - pub fn set_irq(&mut self, line: u8, pin: PciInterruptPin) { - // `pin` is 1-based in the pci config space. - let pin_idx = (pin as u32) + 1; - self.registers[INTERRUPT_LINE_PIN_REG] = (self.registers[INTERRUPT_LINE_PIN_REG] - & 0xffff_0000) - | (pin_idx << 8) - | u32::from(line); - } - /// Adds the capability `cap_data` to the list of capabilities. /// `cap_data` should include the two-byte PCI capability header (type, next), /// but not populate it. 
Correct values will be generated automatically based @@ -940,10 +891,6 @@ impl PciConfiguration { } } - pub fn read_config_register(&self, reg_idx: usize) -> u32 { - self.read_reg(reg_idx) - } - pub fn detect_bar_reprogramming( &mut self, reg_idx: usize, @@ -1074,73 +1021,6 @@ impl Default for PciBarConfiguration { } } -impl PciBarConfiguration { - pub fn new( - idx: usize, - size: u64, - region_type: PciBarRegionType, - prefetchable: PciBarPrefetchable, - ) -> Self { - PciBarConfiguration { - idx, - addr: 0, - size, - region_type, - prefetchable, - } - } - - #[must_use] - pub fn set_index(mut self, idx: usize) -> Self { - self.idx = idx; - self - } - - #[must_use] - pub fn set_address(mut self, addr: u64) -> Self { - self.addr = addr; - self - } - - #[must_use] - pub fn set_size(mut self, size: u64) -> Self { - self.size = size; - self - } - - #[must_use] - pub fn set_region_type(mut self, region_type: PciBarRegionType) -> Self { - self.region_type = region_type; - self - } - - #[must_use] - pub fn set_prefetchable(mut self, prefetchable: PciBarPrefetchable) -> Self { - self.prefetchable = prefetchable; - self - } - - pub fn idx(&self) -> usize { - self.idx - } - - pub fn addr(&self) -> u64 { - self.addr - } - - pub fn size(&self) -> u64 { - self.size - } - - pub fn region_type(&self) -> PciBarRegionType { - self.region_type - } - - pub fn prefetchable(&self) -> PciBarPrefetchable { - self.prefetchable - } -} - #[cfg(test)] mod tests { use vm_memory::ByteValued; diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index 0727f76d269..651fda80914 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -89,28 +89,20 @@ impl PciDevices { virtio_device: &Arc>, ) -> Result<(), PciManagerError> { for bar in &virtio_device.lock().expect("Poisoned lock").bar_regions { - match bar.region_type() { + match bar.region_type { PciBarRegionType::IoRegion => { - debug!( - "Inserting I/O BAR region: {:#x}:{:#x}", - bar.addr(), - bar.size() - ); + debug!("Inserting I/O BAR region: {:#x}:{:#x}", bar.addr, bar.size); #[cfg(target_arch = "x86_64")] vm.pio_bus - .insert(virtio_device.clone(), bar.addr(), bar.size())?; + .insert(virtio_device.clone(), bar.addr, bar.size)?; #[cfg(target_arch = "aarch64")] log::error!("pci: We do not support I/O region allocation") } PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => { - debug!( - "Inserting MMIO BAR region: {:#x}:{:#x}", - bar.addr(), - bar.size() - ); + debug!("Inserting MMIO BAR region: {:#x}:{:#x}", bar.addr, bar.size); vm.common .mmio_bus - .insert(virtio_device.clone(), bar.addr(), bar.size())?; + .insert(virtio_device.clone(), bar.addr, bar.size)?; } } } diff --git a/src/vmm/src/devices/virtio/transport/pci/device.rs b/src/vmm/src/devices/virtio/transport/pci/device.rs index 7ee580fc6a1..b80f0fc5988 100644 --- a/src/vmm/src/devices/virtio/transport/pci/device.rs +++ b/src/vmm/src/devices/virtio/transport/pci/device.rs @@ -918,11 +918,13 @@ impl PciDevice for VirtioPciDevice { (addr, region_type) }; - let bar = PciBarConfiguration::default() - .set_index(VIRTIO_COMMON_BAR_INDEX) - .set_address(virtio_pci_bar_addr) - .set_size(CAPABILITY_BAR_SIZE) - .set_region_type(region_type); + let bar = PciBarConfiguration { + addr: virtio_pci_bar_addr, + size: CAPABILITY_BAR_SIZE, + idx: VIRTIO_COMMON_BAR_INDEX, + region_type, + prefetchable: pci::PciBarPrefetchable::NotPrefetchable, + }; // The creation of the PCI BAR and its associated capabilities must // 
happen only during the creation of a brand new VM. When a VM is @@ -948,8 +950,8 @@ impl PciDevice for VirtioPciDevice { mmio64_allocator: &mut AddressAllocator, ) -> std::result::Result<(), PciDeviceError> { for bar in self.bar_regions.drain(..) { - let range = RangeInclusive::new(bar.addr(), bar.addr() + bar.size()).unwrap(); - match bar.region_type() { + let range = RangeInclusive::new(bar.addr, bar.addr + bar.size).unwrap(); + match bar.region_type { PciBarRegionType::Memory32BitRegion => { mmio32_allocator.free(&range); } @@ -970,8 +972,8 @@ impl PciDevice for VirtioPciDevice { // We only update our idea of the bar in order to support free_bars() above. // The majority of the reallocation is done inside DeviceManager. for bar in self.bar_regions.iter_mut() { - if bar.addr() == old_base { - *bar = bar.set_address(new_base); + if bar.addr == old_base { + bar.addr = new_base; } } From cf12e8ee038875307747d1bfb715f46ab6fd6c08 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 1 Aug 2025 13:18:44 +0200 Subject: [PATCH 84/99] pci: remove unused type from PciDevice::allocate_bars Cloud Hypervisor code is passing a resource type argument in the logic that allocates BARs for devices. `Resource` is an Enum where one of its variants is `PciBar`. Not sure what Cloud Hypervisor uses this for, but it seems redundant since this method is specifically used to allocate BAR memory for devices. We definitely don't use it, so remove it. Signed-off-by: Babis Chalios --- src/pci/src/configuration.rs | 21 ------------------- src/pci/src/device.rs | 4 ---- src/vmm/src/device_manager/pci_mngr.rs | 1 - .../devices/virtio/transport/pci/device.rs | 3 +-- 4 files changed, 1 insertion(+), 28 deletions(-) diff --git a/src/pci/src/configuration.rs b/src/pci/src/configuration.rs index 531e155a2f8..3a2639ca876 100644 --- a/src/pci/src/configuration.rs +++ b/src/pci/src/configuration.rs @@ -10,7 +10,6 @@ use std::sync::{Arc, Mutex}; use byteorder::{ByteOrder, LittleEndian}; use serde::{Deserialize, Serialize}; -use vm_device::PciBarType; use crate::device::BarReprogrammingParams; use crate::MsixConfig; @@ -443,26 +442,6 @@ pub enum PciBarRegionType { Memory64BitRegion = 0x04, } -impl From for PciBarRegionType { - fn from(type_: PciBarType) -> Self { - match type_ { - PciBarType::Io => PciBarRegionType::IoRegion, - PciBarType::Mmio32 => PciBarRegionType::Memory32BitRegion, - PciBarType::Mmio64 => PciBarRegionType::Memory64BitRegion, - } - } -} - -impl From for PciBarType { - fn from(val: PciBarRegionType) -> Self { - match val { - PciBarRegionType::IoRegion => PciBarType::Io, - PciBarRegionType::Memory32BitRegion => PciBarType::Mmio32, - PciBarRegionType::Memory64BitRegion => PciBarType::Mmio64, - } - } -} - #[derive(Debug, Copy, Clone, Serialize, Deserialize)] pub enum PciBarPrefetchable { NotPrefetchable = 0, diff --git a/src/pci/src/device.rs b/src/pci/src/device.rs index bf89331faa9..57f5e63eaeb 100644 --- a/src/pci/src/device.rs +++ b/src/pci/src/device.rs @@ -10,7 +10,6 @@ use std::sync::{Arc, Barrier}; use std::{io, result}; use vm_allocator::AddressAllocator; -use vm_device::Resource; use crate::configuration::{self, PciBarRegionType}; use crate::PciBarConfiguration; @@ -25,8 +24,6 @@ pub enum Error { IoRegistrationFailed(u64, configuration::Error), /// Expected resource not found. 
MissingResource, - /// Invalid resource - InvalidResource(Resource), } pub type Result = std::result::Result; @@ -45,7 +42,6 @@ pub trait PciDevice: Send { &mut self, _mmio32_allocator: &mut AddressAllocator, _mmio64_allocator: &mut AddressAllocator, - _resources: Option>, ) -> Result> { Ok(Vec::new()) } diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index 651fda80914..c393c3c963c 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -143,7 +143,6 @@ impl PciDevices { virtio_device.allocate_bars( &mut resource_allocator.mmio32_memory, &mut resource_allocator.mmio64_memory, - None, )?; let virtio_device = Arc::new(Mutex::new(virtio_device)); diff --git a/src/vmm/src/devices/virtio/transport/pci/device.rs b/src/vmm/src/devices/virtio/transport/pci/device.rs index b80f0fc5988..f4ded11fc2d 100644 --- a/src/vmm/src/devices/virtio/transport/pci/device.rs +++ b/src/vmm/src/devices/virtio/transport/pci/device.rs @@ -27,7 +27,7 @@ use serde::{Deserialize, Serialize}; use thiserror::Error; use vm_allocator::{AddressAllocator, AllocPolicy, RangeInclusive}; use vm_device::interrupt::{InterruptIndex, InterruptSourceGroup, MsiIrqGroupConfig}; -use vm_device::{BusDevice, PciBarType, Resource}; +use vm_device::{BusDevice, PciBarType}; use vm_memory::{Address, ByteValued, GuestAddress, Le32}; use vmm_sys_util::errno; use vmm_sys_util::eventfd::EventFd; @@ -886,7 +886,6 @@ impl PciDevice for VirtioPciDevice { &mut self, mmio32_allocator: &mut AddressAllocator, mmio64_allocator: &mut AddressAllocator, - _resources: Option>, ) -> std::result::Result, PciDeviceError> { let mut bars = Vec::new(); let device_clone = self.device.clone(); From dfa69dbe24c4d3f44ad642e914aefa5127826c61 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 1 Aug 2025 13:32:09 +0200 Subject: [PATCH 85/99] pci: always assume we are using a single 64bit MMIO BAR Despite the fact we are using a single 64bit MMIO BARs for VirtIO devices, we had code that allowed for multiple BARs including BARs of other types (IO and 32bit MMIO). Remove this code and always assume we are only using 64bit BAR at index 0. 
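For reference, a 64-bit memory BAR occupies two consecutive 32-bit BAR registers, so keeping the
single BAR at index 0 means BAR0 holds the low dword (with the type bits marking it as a 64-bit
memory region) and BAR1 holds the high dword. A worked example for a BAR placed at 0x1_0000_0000
(non-prefetchable):

    BAR0 = 0x0000_0004   // low 32 bits of the address | 0b10 << 1 (64-bit memory type)
    BAR1 = 0x0000_0001   // high 32 bits of the address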
Signed-off-by: Babis Chalios --- src/vmm/src/device_manager/pci_mngr.rs | 26 ++-- .../devices/virtio/transport/pci/device.rs | 129 ++++++------------ 2 files changed, 51 insertions(+), 104 deletions(-) diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index c393c3c963c..578d521162b 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -88,24 +88,14 @@ impl PciDevices { vm: &Vm, virtio_device: &Arc>, ) -> Result<(), PciManagerError> { - for bar in &virtio_device.lock().expect("Poisoned lock").bar_regions { - match bar.region_type { - PciBarRegionType::IoRegion => { - debug!("Inserting I/O BAR region: {:#x}:{:#x}", bar.addr, bar.size); - #[cfg(target_arch = "x86_64")] - vm.pio_bus - .insert(virtio_device.clone(), bar.addr, bar.size)?; - #[cfg(target_arch = "aarch64")] - log::error!("pci: We do not support I/O region allocation") - } - PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => { - debug!("Inserting MMIO BAR region: {:#x}:{:#x}", bar.addr, bar.size); - vm.common - .mmio_bus - .insert(virtio_device.clone(), bar.addr, bar.size)?; - } - } - } + let virtio_device_locked = virtio_device.lock().expect("Poisoned lock"); + let bar = &virtio_device_locked.bar_region; + assert_eq!(bar.region_type, PciBarRegionType::Memory64BitRegion); + + debug!("Inserting MMIO BAR region: {:#x}:{:#x}", bar.addr, bar.size); + vm.common + .mmio_bus + .insert(virtio_device.clone(), bar.addr, bar.size)?; Ok(()) } diff --git a/src/vmm/src/devices/virtio/transport/pci/device.rs b/src/vmm/src/devices/virtio/transport/pci/device.rs index f4ded11fc2d..0c4b275bb96 100644 --- a/src/vmm/src/devices/virtio/transport/pci/device.rs +++ b/src/vmm/src/devices/virtio/transport/pci/device.rs @@ -67,6 +67,9 @@ const VIRTIO_F_NOTIFICATION_DATA: u32 = 38; /// Vector value used to disable MSI for a queue. 
const VIRTQ_MSI_NO_VECTOR: u16 = 0xffff; +/// BAR index we are using for VirtIO configuration +const VIRTIO_BAR_INDEX: u8 = 0; + enum PciCapabilityType { Common = 1, Notify = 2, @@ -110,12 +113,12 @@ impl PciCapability for VirtioPciCap { const VIRTIO_PCI_CAP_LEN_OFFSET: u8 = 2; impl VirtioPciCap { - pub fn new(cfg_type: PciCapabilityType, pci_bar: u8, offset: u32, length: u32) -> Self { + pub fn new(cfg_type: PciCapabilityType, offset: u32, length: u32) -> Self { VirtioPciCap { cap_len: u8::try_from(std::mem::size_of::()).unwrap() + VIRTIO_PCI_CAP_LEN_OFFSET, cfg_type: cfg_type as u8, - pci_bar, + pci_bar: VIRTIO_BAR_INDEX, id: 0, padding: [0; 2], offset: Le32::from(offset), @@ -145,19 +148,13 @@ impl PciCapability for VirtioPciNotifyCap { } impl VirtioPciNotifyCap { - pub fn new( - cfg_type: PciCapabilityType, - pci_bar: u8, - offset: u32, - length: u32, - multiplier: Le32, - ) -> Self { + pub fn new(cfg_type: PciCapabilityType, offset: u32, length: u32, multiplier: Le32) -> Self { VirtioPciNotifyCap { cap: VirtioPciCap { cap_len: u8::try_from(std::mem::size_of::()).unwrap() + VIRTIO_PCI_CAP_LEN_OFFSET, cfg_type: cfg_type as u8, - pci_bar, + pci_bar: VIRTIO_BAR_INDEX, id: 0, padding: [0; 2], offset: Le32::from(offset), @@ -231,7 +228,7 @@ impl PciCapability for VirtioPciCfgCap { impl VirtioPciCfgCap { fn new() -> Self { VirtioPciCfgCap { - cap: VirtioPciCap::new(PciCapabilityType::Pci, 0, 0, 0), + cap: VirtioPciCap::new(PciCapabilityType::Pci, 0, 0), ..Default::default() } } @@ -306,7 +303,7 @@ pub struct VirtioPciDeviceState { pub pci_dev_state: VirtioPciCommonConfigState, pub msix_state: MsixConfigState, pub msi_vector_group: Vec, - pub bar_configuration: Vec, + pub bar_configuration: PciBarConfiguration, } #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -348,12 +345,6 @@ pub struct VirtioPciDevice { // Guest memory memory: GuestMemoryMmap, - // Settings PCI BAR - settings_bar: u8, - - // Whether to use 64-bit bar location or 32-bit - use_64bit_bar: bool, - // Add a dedicated structure to hold information about the very specific // virtio-pci capability VIRTIO_PCI_CAP_PCI_CFG. This is needed to support // the legacy/backward compatible mechanism of letting the guest access the @@ -362,8 +353,8 @@ pub struct VirtioPciDevice { // a device. 
cap_pci_cfg_info: VirtioPciCfgCapInfo, - // Details of bar regions to free - pub bar_regions: Vec, + // Details of BAR region + pub bar_region: PciBarConfiguration, } impl Debug for VirtioPciDevice { @@ -471,11 +462,9 @@ impl VirtioPciDevice { interrupt_status: Arc::new(AtomicUsize::new(0)), virtio_interrupt: Some(interrupt), memory, - settings_bar: 0, - use_64bit_bar: true, interrupt_source_group: msi_vectors, cap_pci_cfg_info: VirtioPciCfgCapInfo::default(), - bar_regions: vec![], + bar_region: PciBarConfiguration::default(), }; Ok(virtio_pci_device) @@ -524,11 +513,9 @@ impl VirtioPciDevice { interrupt_status: Arc::new(AtomicUsize::new(state.interrupt_status)), virtio_interrupt: Some(interrupt), memory: memory.clone(), - settings_bar: 0, - use_64bit_bar: true, interrupt_source_group: msi_vectors, cap_pci_cfg_info, - bar_regions: state.bar_configuration, + bar_region: state.bar_configuration, }; if state.device_activated { @@ -558,17 +545,13 @@ impl VirtioPciDevice { } pub fn config_bar_addr(&self) -> u64 { - self.configuration.get_bar_addr(self.settings_bar as usize) + self.configuration.get_bar_addr(VIRTIO_BAR_INDEX as usize) } - fn add_pci_capabilities( - &mut self, - settings_bar: u8, - ) -> std::result::Result<(), PciDeviceError> { + fn add_pci_capabilities(&mut self) -> std::result::Result<(), PciDeviceError> { // Add pointers to the different configuration structures from the PCI capabilities. let common_cap = VirtioPciCap::new( PciCapabilityType::Common, - settings_bar, COMMON_CONFIG_BAR_OFFSET.try_into().unwrap(), COMMON_CONFIG_SIZE.try_into().unwrap(), ); @@ -578,7 +561,6 @@ impl VirtioPciDevice { let isr_cap = VirtioPciCap::new( PciCapabilityType::Isr, - settings_bar, ISR_CONFIG_BAR_OFFSET.try_into().unwrap(), ISR_CONFIG_SIZE.try_into().unwrap(), ); @@ -589,7 +571,6 @@ impl VirtioPciDevice { // TODO(dgreid) - set based on device's configuration size? let device_cap = VirtioPciCap::new( PciCapabilityType::Device, - settings_bar, DEVICE_CONFIG_BAR_OFFSET.try_into().unwrap(), DEVICE_CONFIG_SIZE.try_into().unwrap(), ); @@ -599,7 +580,6 @@ impl VirtioPciDevice { let notify_cap = VirtioPciNotifyCap::new( PciCapabilityType::Notify, - settings_bar, NOTIFICATION_BAR_OFFSET.try_into().unwrap(), NOTIFICATION_SIZE.try_into().unwrap(), Le32::from(NOTIFY_OFF_MULTIPLIER), @@ -618,10 +598,10 @@ impl VirtioPciDevice { if self.msix_config.is_some() { let msix_cap = MsixCap::new( - settings_bar, + VIRTIO_BAR_INDEX, self.msix_num, MSIX_TABLE_BAR_OFFSET.try_into().unwrap(), - settings_bar, + VIRTIO_BAR_INDEX, MSIX_PBA_BAR_OFFSET.try_into().unwrap(), ); self.configuration @@ -629,7 +609,6 @@ impl VirtioPciDevice { .map_err(PciDeviceError::CapabilitiesSetup)?; } - self.settings_bar = settings_bar; Ok(()) } @@ -744,7 +723,7 @@ impl VirtioPciDevice { .expect("Poisoned lock") .state(), msi_vector_group: self.interrupt_source_group.save(), - bar_configuration: self.bar_regions.clone(), + bar_configuration: self.bar_region, } } } @@ -887,41 +866,25 @@ impl PciDevice for VirtioPciDevice { mmio32_allocator: &mut AddressAllocator, mmio64_allocator: &mut AddressAllocator, ) -> std::result::Result, PciDeviceError> { - let mut bars = Vec::new(); let device_clone = self.device.clone(); let device = device_clone.lock().unwrap(); // Allocate the virtio-pci capability BAR. 
// See http://docs.oasis-open.org/virtio/virtio/v1.0/cs04/virtio-v1.0-cs04.html#x1-740004 - let (virtio_pci_bar_addr, region_type) = if self.use_64bit_bar { - let region_type = PciBarRegionType::Memory64BitRegion; - let addr = mmio64_allocator - .allocate( - CAPABILITY_BAR_SIZE, - CAPABILITY_BAR_SIZE, - AllocPolicy::FirstMatch, - ) - .unwrap() - .start(); - (addr, region_type) - } else { - let region_type = PciBarRegionType::Memory32BitRegion; - let addr = mmio32_allocator - .allocate( - CAPABILITY_BAR_SIZE, - CAPABILITY_BAR_SIZE, - AllocPolicy::FirstMatch, - ) - .unwrap() - .start(); - (addr, region_type) - }; + let virtio_pci_bar_addr = mmio64_allocator + .allocate( + CAPABILITY_BAR_SIZE, + CAPABILITY_BAR_SIZE, + AllocPolicy::FirstMatch, + ) + .unwrap() + .start(); let bar = PciBarConfiguration { addr: virtio_pci_bar_addr, size: CAPABILITY_BAR_SIZE, idx: VIRTIO_COMMON_BAR_INDEX, - region_type, + region_type: PciBarRegionType::Memory64BitRegion, prefetchable: pci::PciBarPrefetchable::NotPrefetchable, }; @@ -934,13 +897,10 @@ impl PciDevice for VirtioPciDevice { .map_err(|e| PciDeviceError::IoRegistrationFailed(virtio_pci_bar_addr, e))?; // Once the BARs are allocated, the capabilities can be added to the PCI configuration. - self.add_pci_capabilities(VIRTIO_COMMON_BAR_INDEX.try_into().unwrap())?; - - bars.push(bar); - - self.bar_regions.clone_from(&bars); + self.add_pci_capabilities()?; + self.bar_region = bar; - Ok(bars) + Ok(vec![bar]) } fn free_bars( @@ -948,18 +908,17 @@ impl PciDevice for VirtioPciDevice { mmio32_allocator: &mut AddressAllocator, mmio64_allocator: &mut AddressAllocator, ) -> std::result::Result<(), PciDeviceError> { - for bar in self.bar_regions.drain(..) { - let range = RangeInclusive::new(bar.addr, bar.addr + bar.size).unwrap(); - match bar.region_type { - PciBarRegionType::Memory32BitRegion => { - mmio32_allocator.free(&range); - } - PciBarRegionType::Memory64BitRegion => { - mmio64_allocator.free(&range); - } - _ => error!("Unexpected PCI bar type"), - } - } + assert_eq!( + self.bar_region.region_type, + PciBarRegionType::Memory64BitRegion + ); + + let range = RangeInclusive::new( + self.bar_region.addr, + self.bar_region.addr + self.bar_region.size, + ) + .unwrap(); + mmio64_allocator.free(&range); Ok(()) } @@ -970,10 +929,8 @@ impl PciDevice for VirtioPciDevice { ) -> std::result::Result<(), std::io::Error> { // We only update our idea of the bar in order to support free_bars() above. // The majority of the reallocation is done inside DeviceManager. - for bar in self.bar_regions.iter_mut() { - if bar.addr == old_base { - bar.addr = new_base; - } + if self.bar_region.addr == old_base { + self.bar_region.addr = new_base; } Ok(()) From d5dceb75fbd6f065a82f9cb4393ee4a830bfd06e Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Mon, 4 Aug 2025 14:12:46 +0100 Subject: [PATCH 86/99] feat(virtio/interrupt): add trigger_queues method This method is used to notify the guest about queue events in a way that's most performant with the underlying interrupt implementation. As in IrqTrigger there is no distinction between different queues, it's best to send just one interrupt notifiying that "some queues" have a pending event. Conversely, in VirtioInterruptMsix, we need to trigger a MSI for each distinct queue. 
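A rough device-side usage sketch (the helper name and queue indices below are made up for
illustration): collect the queues that gained used descriptors while draining events, then
notify once at the end.

    fn notify_used_queues(
        interrupt: &dyn VirtioInterrupt,
        rx_added: bool,
        tx_added: bool,
    ) -> Result<(), std::io::Error> {
        let mut used_queues: Vec<u16> = Vec::new();
        if rx_added {
            used_queues.push(0); // RX queue index
        }
        if tx_added {
            used_queues.push(1); // TX queue index
        }
        // IrqTrigger collapses the list into a single legacy IRQ, while
        // VirtioInterruptMsix sends one MSI per listed queue.
        interrupt.trigger_queues(&used_queues)
    }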
Signed-off-by: Riccardo Mancini Signed-off-by: Babis Chalios --- src/vmm/src/devices/virtio/transport/mmio.rs | 8 ++++++++ src/vmm/src/devices/virtio/transport/mod.rs | 12 ++++++++++++ 2 files changed, 20 insertions(+) diff --git a/src/vmm/src/devices/virtio/transport/mmio.rs b/src/vmm/src/devices/virtio/transport/mmio.rs index 3a8aa1ad42e..42cfe2b3ed7 100644 --- a/src/vmm/src/devices/virtio/transport/mmio.rs +++ b/src/vmm/src/devices/virtio/transport/mmio.rs @@ -406,6 +406,14 @@ impl VirtioInterrupt for IrqTrigger { } } + fn trigger_queues(&self, queues: &[u16]) -> Result<(), std::io::Error> { + if queues.is_empty() { + Ok(()) + } else { + self.trigger_irq(IrqType::Vring) + } + } + fn notifier(&self, _interrupt_type: VirtioInterruptType) -> Option<&EventFd> { Some(&self.irq_evt) } diff --git a/src/vmm/src/devices/virtio/transport/mod.rs b/src/vmm/src/devices/virtio/transport/mod.rs index c16a7adbe9d..39dfe05a4fd 100644 --- a/src/vmm/src/devices/virtio/transport/mod.rs +++ b/src/vmm/src/devices/virtio/transport/mod.rs @@ -25,6 +25,18 @@ pub trait VirtioInterrupt: std::fmt::Debug + Send + Sync { /// Trigger a VirtIO interrupt. fn trigger(&self, interrupt_type: VirtioInterruptType) -> Result<(), std::io::Error>; + /// Trigger multiple Virtio interrupts for selected queues. + /// The caller needs to ensure that [`queues`] does not include duplicate entries to + /// avoid sending multiple interrupts for the same queue. + /// This is to allow sending a single interrupt for implementations that don't + /// distinguish different queues, like IrqTrigger, instead of sending multiple same + /// interrupts. + fn trigger_queues(&self, queues: &[u16]) -> Result<(), std::io::Error> { + queues + .iter() + .try_for_each(|&qidx| self.trigger(VirtioInterruptType::Queue(qidx))) + } + /// Get the `EventFd` (if any) that backs the underlying interrupt. fn notifier(&self, _interrupt_type: VirtioInterruptType) -> Option<&EventFd> { None From ed591ac04011eacc67b3b3f90085504ed4dfafc6 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Mon, 4 Aug 2025 14:16:32 +0100 Subject: [PATCH 87/99] fix(vsock): restore previous performance with PCI disabled We noticed that, compared to main, the vsock device is up to 30% slower when PCI is disabled. This is due to the refactor in 087e185 ("fix(vsock): pass correct index when triggering interrupts"), as we're now sending interrupts as soon as we detect the event, rather than deferring them and sending only one interrupt at the end. While with MSI we need to send multiple interrupts, so there is little difference (still, tests show up to 5% improvement with this change), with legacy IRQ there's only one interrupt line so we end up sending multiple back to back interrupts rather than a single one. This patch reverts to the previous behaviour and uses the newly introduced `trigger_queues` method to deduplicate interrupts in the case of IrqTrigger. 
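As an illustration (simplified from the event handler change below), a TX event that also
refills RX buffers used to raise two back-to-back legacy interrupts and now raises at most one:

    // before: one interrupt per queue, sent immediately
    //   self.signal_used_queue(TXQ_INDEX).expect(...);
    //   self.signal_used_queue(RXQ_INDEX).expect(...);
    //
    // after: defer, then deduplicate via trigger_queues()
    //   self.signal_used_queues(&[TXQ_INDEX as u16, RXQ_INDEX as u16]).expect(...);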
Signed-off-by: Riccardo Mancini Signed-off-by: Babis Chalios --- src/vmm/src/devices/virtio/vsock/device.rs | 10 ++++ .../src/devices/virtio/vsock/event_handler.rs | 52 ++++++++++++------- 2 files changed, 42 insertions(+), 20 deletions(-) diff --git a/src/vmm/src/devices/virtio/vsock/device.rs b/src/vmm/src/devices/virtio/vsock/device.rs index 56426d1ea0f..43c9d4cb2ba 100644 --- a/src/vmm/src/devices/virtio/vsock/device.rs +++ b/src/vmm/src/devices/virtio/vsock/device.rs @@ -148,6 +148,16 @@ where .map_err(DeviceError::FailedSignalingIrq) } + /// Signal the guest which queues are ready to be consumed + pub fn signal_used_queues(&self, used_queues: &[u16]) -> Result<(), DeviceError> { + self.device_state + .active_state() + .expect("Device is not initialized") + .interrupt + .trigger_queues(used_queues) + .map_err(DeviceError::FailedSignalingIrq) + } + /// Walk the driver-provided RX queue buffers and attempt to fill them up with any data that we /// have pending. Return `true` if descriptors have been added to the used ring, and `false` /// otherwise. diff --git a/src/vmm/src/devices/virtio/vsock/event_handler.rs b/src/vmm/src/devices/virtio/vsock/event_handler.rs index e9e325c47e4..a983a332aa3 100755 --- a/src/vmm/src/devices/virtio/vsock/event_handler.rs +++ b/src/vmm/src/devices/virtio/vsock/event_handler.rs @@ -34,6 +34,7 @@ use super::VsockBackend; use super::device::{EVQ_INDEX, RXQ_INDEX, TXQ_INDEX, Vsock}; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::InvalidAvailIdx; +use crate::devices::virtio::vsock::defs::VSOCK_NUM_QUEUES; use crate::devices::virtio::vsock::metrics::METRICS; use crate::logger::IncMetric; @@ -47,11 +48,12 @@ where const PROCESS_EVQ: u32 = 3; const PROCESS_NOTIFY_BACKEND: u32 = 4; - pub fn handle_rxq_event(&mut self, evset: EventSet) { + pub fn handle_rxq_event(&mut self, evset: EventSet) -> Vec { + let mut used_queues = Vec::new(); if evset != EventSet::IN { warn!("vsock: rxq unexpected event {:?}", evset); METRICS.rx_queue_event_fails.inc(); - return; + return used_queues; } if let Err(err) = self.queue_events[RXQ_INDEX].read() { @@ -59,18 +61,19 @@ where METRICS.rx_queue_event_fails.inc(); } else if self.backend.has_pending_rx() { if self.process_rx().unwrap() { - self.signal_used_queue(RXQ_INDEX) - .expect("vsock: Could not trigger device interrupt or RX queue"); + used_queues.push(RXQ_INDEX.try_into().unwrap()); } METRICS.rx_queue_event_count.inc(); } + used_queues } - pub fn handle_txq_event(&mut self, evset: EventSet) { + pub fn handle_txq_event(&mut self, evset: EventSet) -> Vec { + let mut used_queues = Vec::new(); if evset != EventSet::IN { warn!("vsock: txq unexpected event {:?}", evset); METRICS.tx_queue_event_fails.inc(); - return; + return used_queues; } if let Err(err) = self.queue_events[TXQ_INDEX].read() { @@ -78,18 +81,17 @@ where METRICS.tx_queue_event_fails.inc(); } else { if self.process_tx().unwrap() { - self.signal_used_queue(TXQ_INDEX) - .expect("vsock: Could not trigger device interrupt or TX queue"); + used_queues.push(TXQ_INDEX.try_into().unwrap()); } METRICS.tx_queue_event_count.inc(); // The backend may have queued up responses to the packets we sent during // TX queue processing. If that happened, we need to fetch those responses // and place them into RX buffers. 
if self.backend.has_pending_rx() && self.process_rx().unwrap() { - self.signal_used_queue(RXQ_INDEX) - .expect("vsock: Could not trigger device interrupt or RX queue"); + used_queues.push(RXQ_INDEX.try_into().unwrap()); } } + used_queues } pub fn handle_evq_event(&mut self, evset: EventSet) { @@ -106,7 +108,8 @@ where } /// Notify backend of new events. - pub fn notify_backend(&mut self, evset: EventSet) -> Result<(), InvalidAvailIdx> { + pub fn notify_backend(&mut self, evset: EventSet) -> Result, InvalidAvailIdx> { + let mut used_queues = Vec::new(); self.backend.notify(evset); // After the backend has been kicked, it might've freed up some resources, so we // can attempt to send it more data to process. @@ -114,15 +117,13 @@ where // returning an error) at some point in the past, now is the time to try walking the // TX queue again. if self.process_tx()? { - self.signal_used_queue(TXQ_INDEX) - .expect("vsock: Could not trigger device interrupt or TX queue"); + used_queues.push(TXQ_INDEX.try_into().unwrap()); } if self.backend.has_pending_rx() && self.process_rx()? { - self.signal_used_queue(RXQ_INDEX) - .expect("vsock: Could not trigger device interrupt or RX queue"); + used_queues.push(RXQ_INDEX.try_into().unwrap()) } - Ok(()) + Ok(used_queues) } fn register_runtime_events(&self, ops: &mut EventOps) { @@ -190,14 +191,25 @@ where let evset = event.event_set(); if self.is_activated() { - match source { - Self::PROCESS_ACTIVATE => self.handle_activate_event(ops), + let used_queues = match source { + Self::PROCESS_ACTIVATE => { + self.handle_activate_event(ops); + Vec::new() + } Self::PROCESS_RXQ => self.handle_rxq_event(evset), Self::PROCESS_TXQ => self.handle_txq_event(evset), - Self::PROCESS_EVQ => self.handle_evq_event(evset), + Self::PROCESS_EVQ => { + self.handle_evq_event(evset); + Vec::new() + } Self::PROCESS_NOTIFY_BACKEND => self.notify_backend(evset).unwrap(), - _ => warn!("Unexpected vsock event received: {:?}", source), + _ => { + warn!("Unexpected vsock event received: {:?}", source); + Vec::new() + } }; + self.signal_used_queues(&used_queues) + .expect("vsock: Could not trigger device interrupt"); } else { warn!( "Vsock: The device is not yet activated. Spurious event received: {:?}", From 6cf32b9706a9cfd0855f502c32e835890bf8cf1d Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 8 Aug 2025 11:24:19 +0200 Subject: [PATCH 88/99] fix(virtio): avoid panic on device activation failure According to the VirtIO spec section 2.1.2, when we enter a failed state we need to to set `DEVICE_NEEDS_RESET` status field and if `DRIVER_OK` is set we need to must send a device configuration change interrupt to the driver. In MMIO code we were trying to follow this logic, but we were trying to get the interrupt object from the device, which fails (and panics) because interrupts are only set in the device upon successful activation. In the PCI transport, instead, we were not doing this at all. The transport layers hold a reference to the interrupts at all times. So, add the missing logic to the PCI transport and use the transport layer reference in both transports to avoid panics. 
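The resulting flow in both transports is roughly the following (a sketch, not the literal code;
`device_status` and `device_activated` stand for the transport's own bookkeeping):

    match device.activate(mem.clone(), interrupt.clone()) {
        Ok(()) => device_activated = true,
        Err(err) => {
            error!("Failed to activate virtio device: {err}");
            // Virtio spec 2.1.2: flag the failure and, since the driver has
            // already written DRIVER_OK, notify it with a config interrupt.
            device_status |= DEVICE_NEEDS_RESET;
            let _ = interrupt.trigger(VirtioInterruptType::Config);
        }
    }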
Signed-off-by: Babis Chalios --- src/vmm/src/devices/virtio/transport/mmio.rs | 5 +---- .../devices/virtio/transport/pci/device.rs | 21 ++++++++++++------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/vmm/src/devices/virtio/transport/mmio.rs b/src/vmm/src/devices/virtio/transport/mmio.rs index 42cfe2b3ed7..8fbf058e318 100644 --- a/src/vmm/src/devices/virtio/transport/mmio.rs +++ b/src/vmm/src/devices/virtio/transport/mmio.rs @@ -190,10 +190,7 @@ impl MmioTransport { // Section 2.1.2 of the specification states that we need to send a device // configuration change interrupt - let _ = self - .locked_device() - .interrupt_trigger() - .trigger(VirtioInterruptType::Config); + let _ = self.interrupt.trigger(VirtioInterruptType::Config); error!("Failed to activate virtio device: {}", err) } diff --git a/src/vmm/src/devices/virtio/transport/pci/device.rs b/src/vmm/src/devices/virtio/transport/pci/device.rs index 0c4b275bb96..ba91163fe49 100644 --- a/src/vmm/src/devices/virtio/transport/pci/device.rs +++ b/src/vmm/src/devices/virtio/transport/pci/device.rs @@ -1029,15 +1029,22 @@ impl PciDevice for VirtioPciDevice { // Try and activate the device if the driver status has changed if self.needs_activation() { debug!("Activating device"); - self.virtio_device() + let interrupt = Arc::clone(self.virtio_interrupt.as_ref().unwrap()); + match self + .virtio_device() .lock() .unwrap() - .activate( - self.memory.clone(), - Arc::clone(self.virtio_interrupt.as_ref().unwrap()), - ) - .unwrap_or_else(|err| error!("Error activating device: {err:?}")); - self.device_activated.store(true, Ordering::SeqCst); + .activate(self.memory.clone(), interrupt.clone()) + { + Ok(()) => self.device_activated.store(true, Ordering::SeqCst), + Err(err) => { + error!("Error activating device: {err:?}"); + + // Section 2.1.2 of the specification states that we need to send a device + // configuration change interrupt + let _ = interrupt.trigger(VirtioInterruptType::Config); + } + } } else { debug!("Device doesn't need activation"); } From eab2b40d2e20712276462d1a8047d6be686ac37f Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 8 Aug 2025 11:35:39 +0200 Subject: [PATCH 89/99] fix(mmio): avoid locking multiple times in same code branch MMIO transport layer holds a Mutex to the VirtIO device. Within the logic that handles the handshake between the driver and the device, there were cases where we would take and release the lock multiple times within the same code branch. Change this to only take the lock once. 
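The pattern is simply to take the guard once at the top of the branch and keep using it
(sketch, error handling elided):

    let mut locked_device = self.device.lock().expect("Poisoned lock");
    if !locked_device.is_activated() {
        // The check and the activation both happen under the same guard,
        // instead of re-locking the device for every call.
        locked_device.activate(self.mem.clone(), self.interrupt.clone())?;
    }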
Signed-off-by: Babis Chalios --- src/vmm/src/devices/virtio/transport/mmio.rs | 27 +++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/src/vmm/src/devices/virtio/transport/mmio.rs b/src/vmm/src/devices/virtio/transport/mmio.rs index 8fbf058e318..4964f837aca 100644 --- a/src/vmm/src/devices/virtio/transport/mmio.rs +++ b/src/vmm/src/devices/virtio/transport/mmio.rs @@ -179,12 +179,12 @@ impl MmioTransport { } DRIVER_OK if self.device_status == (ACKNOWLEDGE | DRIVER | FEATURES_OK) => { self.device_status = status; - let device_activated = self.locked_device().is_activated(); + let mut locked_device = self.device.lock().expect("Poisoned lock"); + let device_activated = locked_device.is_activated(); if !device_activated { // temporary variable needed for borrow checker - let activate_result = self - .locked_device() - .activate(self.mem.clone(), self.interrupt.clone()); + let activate_result = + locked_device.activate(self.mem.clone(), self.interrupt.clone()); if let Err(err) = activate_result { self.device_status |= DEVICE_NEEDS_RESET; @@ -201,16 +201,19 @@ impl MmioTransport { self.device_status |= FAILED; } _ if status == 0 => { - if self.locked_device().is_activated() { - let mut device_status = self.device_status; - let reset_result = self.locked_device().reset(); - match reset_result { - Some((_interrupt_evt, mut _queue_evts)) => {} - None => { - device_status |= FAILED; + { + let mut locked_device = self.device.lock().expect("Poisoned lock"); + if locked_device.is_activated() { + let mut device_status = self.device_status; + let reset_result = locked_device.reset(); + match reset_result { + Some((_interrupt_evt, mut _queue_evts)) => {} + None => { + device_status |= FAILED; + } } + self.device_status = device_status; } - self.device_status = device_status; } // If the backend device driver doesn't support reset, From efccd18bc39f47a937829bd05e60a150dc1270a9 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 11 Aug 2025 13:38:39 +0200 Subject: [PATCH 90/99] fix(virtio-pci): check guest values for MSI-X vector We should check that the MSI-X vector that a guest assigns to a VirtIO queue or configuration is a valid one. If it is a value bigger than the available vectors allocated for the device, it should not update the vector field and mark the corresponding vector to the NO_VECTOR value. Also, add checks in the MSI implementation for VirtioInterrupt trait, for making sure that we don't try to interact with out-of-range interrupt vectors. 
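The validation rule boils down to the following (sketch; we offer `num_queues` vectors for the
queues plus one for configuration changes):

    let nr_vectors = num_queues + 1;
    let stored = if (value as usize) < nr_vectors {
        value
    } else {
        VIRTQ_MSI_NO_VECTOR // 0xffff, i.e. "no vector assigned"
    };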
Signed-off-by: Babis Chalios --- .../virtio/transport/pci/common_config.rs | 40 ++++++++++++++----- .../devices/virtio/transport/pci/device.rs | 21 ++++++---- 2 files changed, 42 insertions(+), 19 deletions(-) diff --git a/src/vmm/src/devices/virtio/transport/pci/common_config.rs b/src/vmm/src/devices/virtio/transport/pci/common_config.rs index 00b61e67b67..d353b04c43e 100644 --- a/src/vmm/src/devices/virtio/transport/pci/common_config.rs +++ b/src/vmm/src/devices/virtio/transport/pci/common_config.rs @@ -16,6 +16,7 @@ use vm_memory::GuestAddress; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::Queue; +use crate::devices::virtio::transport::pci::device::VIRTQ_MSI_NO_VECTOR; use crate::logger::{debug, error, info, trace, warn}; pub const VIRTIO_PCI_COMMON_CONFIG_ID: &str = "virtio_pci_common_config"; @@ -242,7 +243,7 @@ impl VirtioPciCommonConfig { .unwrap() .get(self.queue_select as usize) .copied() - .unwrap_or(0xffff), + .unwrap_or(VIRTQ_MSI_NO_VECTOR), 0x1c => u16::from(self.with_queue(queues, |q| q.ready).unwrap_or(false)), 0x1e => self.queue_select, // notify_off _ => { @@ -255,19 +256,36 @@ impl VirtioPciCommonConfig { fn write_common_config_word(&mut self, offset: u64, value: u16, queues: &mut [Queue]) { debug!("write_common_config_word: offset 0x{:x}", offset); match offset { - 0x10 => self.msix_config.store(value, Ordering::Release), + 0x10 => { + // Make sure that the guest doesn't select an invalid vector. We are offering + // `num_queues + 1` vectors (plus one for configuration updates). If an invalid + // vector has been selected, we just store the `NO_VECTOR` value. + let mut msix_queues = self.msix_queues.lock().expect("Poisoned lock"); + let nr_vectors = msix_queues.len() + 1; + + if (value as usize) < nr_vectors { + self.msix_config.store(value, Ordering::Release); + } else { + self.msix_config + .store(VIRTQ_MSI_NO_VECTOR, Ordering::Release); + } + } 0x16 => self.queue_select = value, 0x18 => self.with_queue_mut(queues, |q| q.size = value), 0x1a => { + let mut msix_queues = self.msix_queues.lock().expect("Poisoned lock"); + let nr_vectors = msix_queues.len() + 1; // Make sure that `queue_select` points to a valid queue. If not, we won't do // anything here and subsequent reads at 0x1a will return `NO_VECTOR`. - if let Some(msix_queue) = self - .msix_queues - .lock() - .unwrap() - .get_mut(self.queue_select as usize) - { - *msix_queue = value; + if let Some(queue) = msix_queues.get_mut(self.queue_select as usize) { + // Make sure that the guest doesn't select an invalid vector. We are offering + // `num_queues + 1` vectors (plus one for configuration updates). If an invalid + // vector has been selected, we just store the `NO_VECTOR` value. + if (value as usize) < nr_vectors { + *queue = value; + } else { + *queue = VIRTQ_MSI_NO_VECTOR; + } } } 0x1c => self.with_queue_mut(queues, |q| { @@ -446,8 +464,8 @@ mod tests { // Valid `queue_select` though should setup the corresponding MSI-X queue. 
regs.write(0x16, &[0x1, 0x0], dev.clone()); assert_eq!(regs.queue_select, 1); - regs.write(0x1a, &[0x12, 0x13], dev.clone()); + regs.write(0x1a, &[0x1, 0x0], dev.clone()); regs.read(0x1a, &mut read_back, dev); - assert_eq!(LittleEndian::read_u16(&read_back[..2]), 0x1312); + assert_eq!(LittleEndian::read_u16(&read_back[..2]), 0x1); } } diff --git a/src/vmm/src/devices/virtio/transport/pci/device.rs b/src/vmm/src/devices/virtio/transport/pci/device.rs index ba91163fe49..9daf88201ac 100644 --- a/src/vmm/src/devices/virtio/transport/pci/device.rs +++ b/src/vmm/src/devices/virtio/transport/pci/device.rs @@ -11,7 +11,7 @@ use std::any::Any; use std::cmp; use std::collections::HashMap; use std::fmt::{Debug, Formatter}; -use std::io::Write; +use std::io::{ErrorKind, Write}; use std::sync::atomic::{AtomicBool, AtomicU16, AtomicU32, AtomicUsize, Ordering}; use std::sync::{Arc, Barrier, Mutex}; @@ -65,7 +65,7 @@ const VIRTIO_F_SR_IOV: u32 = 37; const VIRTIO_F_NOTIFICATION_DATA: u32 = 38; /// Vector value used to disable MSI for a queue. -const VIRTQ_MSI_NO_VECTOR: u16 = 0xffff; +pub const VIRTQ_MSI_NO_VECTOR: u16 = 0xffff; /// BAR index we are using for VirtIO configuration const VIRTIO_BAR_INDEX: u8 = 0; @@ -765,9 +765,12 @@ impl VirtioInterrupt for VirtioInterruptMsix { fn trigger(&self, int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { let vector = match int_type { VirtioInterruptType::Config => self.config_vector.load(Ordering::Acquire), - VirtioInterruptType::Queue(queue_index) => { - self.queues_vectors.lock().unwrap()[queue_index as usize] - } + VirtioInterruptType::Queue(queue_index) => *self + .queues_vectors + .lock() + .unwrap() + .get(queue_index as usize) + .ok_or(ErrorKind::InvalidInput)?, }; if vector == VIRTQ_MSI_NO_VECTOR { @@ -793,9 +796,11 @@ impl VirtioInterrupt for VirtioInterruptMsix { fn notifier(&self, int_type: VirtioInterruptType) -> Option<&EventFd> { let vector = match int_type { VirtioInterruptType::Config => self.config_vector.load(Ordering::Acquire), - VirtioInterruptType::Queue(queue_index) => { - self.queues_vectors.lock().unwrap()[queue_index as usize] - } + VirtioInterruptType::Queue(queue_index) => *self + .queues_vectors + .lock() + .unwrap() + .get(queue_index as usize)?, }; self.interrupt_source_group From bd22a2dade76a1d26c0416691db61b6d04bfe8c3 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Mon, 11 Aug 2025 17:38:53 +0100 Subject: [PATCH 91/99] refactor(pcie): avoid string format on good path If a function is called in the `.ok_or`, this gets executed independently on whether the function succeeds or not, although the result is only used on failure. Replace it with a .ok_or_else to avoid a useless string allocation and memcpy. Signed-off-by: Riccardo Mancini Signed-off-by: Babis Chalios --- src/vmm/src/vstate/vm.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index aeb61f88c56..04ad9bc7907 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -170,7 +170,7 @@ impl InterruptSourceGroup for MsiVectorGroup { fn trigger(&self, index: InterruptIndex) -> vm_device::interrupt::Result<()> { self.notifier(index) - .ok_or(std::io::Error::other(format!( + .ok_or_else(|| std::io::Error::other(format!( "trigger: invalid interrupt index {index}" )))? 
.write(1) From a5cb371f2d0c3722bb0fe445ffac646c5a89aec4 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Mon, 11 Aug 2025 17:41:09 +0100 Subject: [PATCH 92/99] chore(clippy): enable warn for or_fun_call "or_fun_call" warns us when a function is called in side a "*_or(...)" function. In this case the value is computed (which may be expensive), but is only used if the Result/Option is an Error/None. In these cases, using the "*_or_else" variant is the best thing to do. Signed-off-by: Riccardo Mancini Signed-off-by: Babis Chalios --- Cargo.toml | 1 + src/clippy-tracing/src/main.rs | 2 +- src/cpu-template-helper/src/template/verify/mod.rs | 2 +- src/vmm/src/vstate/vm.rs | 6 +++--- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 37a76cdd34f..a1c9ad79621 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,7 @@ exit = "warn" tests_outside_test_module = "warn" assertions_on_result_states = "warn" error_impl_error = "warn" +or_fun_call = "warn" [profile.dev] panic = "abort" diff --git a/src/clippy-tracing/src/main.rs b/src/clippy-tracing/src/main.rs index c89fb6a5d37..721fca12b25 100644 --- a/src/clippy-tracing/src/main.rs +++ b/src/clippy-tracing/src/main.rs @@ -260,7 +260,7 @@ impl Error for ExecError {} fn exec() -> Result, ExecError> { let args = CommandLineArgs::parse(); - let path = args.path.unwrap_or(PathBuf::from(".")); + let path = args.path.unwrap_or_else(|| PathBuf::from(".")); for entry_res in WalkDir::new(path).follow_links(true) { let entry = entry_res.map_err(ExecError::Entry)?; let entry_path = entry.into_path(); diff --git a/src/cpu-template-helper/src/template/verify/mod.rs b/src/cpu-template-helper/src/template/verify/mod.rs index 1a83f6ba1b2..1f42e2f06cc 100644 --- a/src/cpu-template-helper/src/template/verify/mod.rs +++ b/src/cpu-template-helper/src/template/verify/mod.rs @@ -43,7 +43,7 @@ where for (key, template_value_filter) in template { let config_value_filter = config .get(&key) - .ok_or(VerifyError::KeyNotFound(key.to_string()))?; + .ok_or_else(|| VerifyError::KeyNotFound(key.to_string()))?; let template_value = template_value_filter.value & template_value_filter.filter; let config_value = config_value_filter.value & template_value_filter.filter; diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 04ad9bc7907..8c4049f9e0c 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -170,9 +170,9 @@ impl InterruptSourceGroup for MsiVectorGroup { fn trigger(&self, index: InterruptIndex) -> vm_device::interrupt::Result<()> { self.notifier(index) - .ok_or_else(|| std::io::Error::other(format!( - "trigger: invalid interrupt index {index}" - )))? + .ok_or_else(|| { + std::io::Error::other(format!("trigger: invalid interrupt index {index}")) + })? .write(1) } From a61390a84cf401f4be471f01a63f68afe65865cf Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 4 Aug 2025 12:23:06 +0200 Subject: [PATCH 93/99] pci: remove unused members of the PciDevice trait We are not really using the `as_any` and `id` members of the PciDevice trait. At the moment, only VirtIO devices and the PCI root port is implementing PciDevice and we are never iterating over the container of PCI devices. 
Signed-off-by: Babis Chalios --- src/pci/src/bus.rs | 9 ------ src/pci/src/device.rs | 12 ++------ .../devices/virtio/transport/pci/device.rs | 30 +------------------ 3 files changed, 3 insertions(+), 48 deletions(-) diff --git a/src/pci/src/bus.rs b/src/pci/src/bus.rs index eaa2923e8db..4f19360c097 100644 --- a/src/pci/src/bus.rs +++ b/src/pci/src/bus.rs @@ -5,7 +5,6 @@ // // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause -use std::any::Any; use std::collections::HashMap; use std::ops::DerefMut; use std::sync::{Arc, Barrier, Mutex}; @@ -87,14 +86,6 @@ impl PciDevice for PciRoot { fn read_config_register(&mut self, reg_idx: usize) -> u32 { self.config.read_reg(reg_idx) } - - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } - - fn id(&self) -> Option { - None - } } pub struct PciBus { diff --git a/src/pci/src/device.rs b/src/pci/src/device.rs index 57f5e63eaeb..ba84e6ec6bf 100644 --- a/src/pci/src/device.rs +++ b/src/pci/src/device.rs @@ -5,14 +5,12 @@ // // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause -use std::any::Any; use std::sync::{Arc, Barrier}; use std::{io, result}; use vm_allocator::AddressAllocator; use crate::configuration::{self, PciBarRegionType}; -use crate::PciBarConfiguration; #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum Error { @@ -42,8 +40,8 @@ pub trait PciDevice: Send { &mut self, _mmio32_allocator: &mut AddressAllocator, _mmio64_allocator: &mut AddressAllocator, - ) -> Result> { - Ok(Vec::new()) + ) -> Result<()> { + Ok(()) } /// Frees the PCI BARs previously allocated with a call to allocate_bars(). @@ -89,12 +87,6 @@ pub trait PciDevice: Send { fn move_bar(&mut self, _old_base: u64, _new_base: u64) -> result::Result<(), io::Error> { Ok(()) } - /// Provides a mutable reference to the Any trait. This is useful to let - /// the caller have access to the underlying type behind the trait. - fn as_any_mut(&mut self) -> &mut dyn Any; - - /// Optionally returns a unique identifier. 
- fn id(&self) -> Option; } /// This trait defines a set of functions which can be triggered whenever a diff --git a/src/vmm/src/devices/virtio/transport/pci/device.rs b/src/vmm/src/devices/virtio/transport/pci/device.rs index 9daf88201ac..12d6ff10345 100644 --- a/src/vmm/src/devices/virtio/transport/pci/device.rs +++ b/src/vmm/src/devices/virtio/transport/pci/device.rs @@ -7,7 +7,6 @@ // // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause -use std::any::Any; use std::cmp; use std::collections::HashMap; use std::fmt::{Debug, Formatter}; @@ -870,7 +869,7 @@ impl PciDevice for VirtioPciDevice { &mut self, mmio32_allocator: &mut AddressAllocator, mmio64_allocator: &mut AddressAllocator, - ) -> std::result::Result, PciDeviceError> { + ) -> std::result::Result<(), PciDeviceError> { let device_clone = self.device.clone(); let device = device_clone.lock().unwrap(); @@ -905,25 +904,6 @@ impl PciDevice for VirtioPciDevice { self.add_pci_capabilities()?; self.bar_region = bar; - Ok(vec![bar]) - } - - fn free_bars( - &mut self, - mmio32_allocator: &mut AddressAllocator, - mmio64_allocator: &mut AddressAllocator, - ) -> std::result::Result<(), PciDeviceError> { - assert_eq!( - self.bar_region.region_type, - PciBarRegionType::Memory64BitRegion - ); - - let range = RangeInclusive::new( - self.bar_region.addr, - self.bar_region.addr + self.bar_region.size, - ) - .unwrap(); - mmio64_allocator.free(&range); Ok(()) } @@ -1083,14 +1063,6 @@ impl PciDevice for VirtioPciDevice { None } - - fn id(&self) -> Option { - Some(self.id.clone()) - } - - fn as_any_mut(&mut self) -> &mut dyn Any { - self - } } impl BusDevice for VirtioPciDevice { From 57aab34dc659fc0f6fbb7f7796df66f7c0ec3cdd Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 5 Aug 2025 09:15:44 +0200 Subject: [PATCH 94/99] pci: add unit test for adding BARs in config space Add a unit test for the logic that handles adding BARs for a device. Also, ensure that we check that the BAR index we are adding is within range before we use it to index the BARs vector. 
Signed-off-by: Babis Chalios --- src/pci/src/configuration.rs | 172 ++++++++++++++++++++++++++++++++++- 1 file changed, 167 insertions(+), 5 deletions(-) diff --git a/src/pci/src/configuration.rs b/src/pci/src/configuration.rs index 3a2639ca876..03f63f2c45f 100644 --- a/src/pci/src/configuration.rs +++ b/src/pci/src/configuration.rs @@ -706,6 +706,10 @@ impl PciConfiguration { let bar_idx = config.idx; let reg_idx = BAR0_REG + bar_idx; + if bar_idx >= NUM_BAR_REGS { + return Err(Error::BarInvalid(bar_idx)); + } + if self.bars[bar_idx].used { return Err(Error::BarInUse(bar_idx)); } @@ -714,10 +718,6 @@ impl PciConfiguration { return Err(Error::BarSizeInvalid(config.size)); } - if bar_idx >= NUM_BAR_REGS { - return Err(Error::BarInvalid(bar_idx)); - } - let end_addr = config .addr .checked_add(config.size - 1) @@ -739,7 +739,7 @@ impl PciConfiguration { } if self.bars[bar_idx + 1].used { - return Err(Error::BarInUse64(bar_idx)); + return Err(Error::BarInUse64(bar_idx + 1)); } // Encode the BAR size as expected by the software running in @@ -1002,6 +1002,7 @@ impl Default for PciBarConfiguration { #[cfg(test)] mod tests { + use vm_memory::ByteValued; use super::*; @@ -1108,4 +1109,165 @@ mod tests { assert_eq!(subclass, 0x01); assert_eq!(prog_if, 0x5a); } + + #[test] + fn test_bar_size_encoding() { + assert!(encode_32_bits_bar_size(0).is_none()); + assert!(decode_32_bits_bar_size(0).is_none()); + assert!(encode_64_bits_bar_size(0).is_none()); + assert!(decode_64_bits_bar_size(0, 0).is_none()); + + // According to OSDev wiki (https://wiki.osdev.org/PCI#Address_and_size_of_the_BAR): + // + // > To determine the amount of address space needed by a PCI device, you must save the + // > original value of the BAR, write a value of all 1's to the register, then read it back. + // > The amount of memory can then be determined by masking the information bits, performing + // > a bitwise NOT ('~' in C), and incrementing the value by 1. The original value of the + // BAR > should then be restored. The BAR register is naturally aligned and as such you can + // only > modify the bits that are set. For example, if a device utilizes 16 MB it will + // have BAR0 > filled with 0xFF000000 (0x1000000 after decoding) and you can only modify + // the upper > 8-bits. + // + // So we should be encoding an address like this: `addr` -> `!(addr - 1)` + let encoded = encode_32_bits_bar_size(0x0101_0101).unwrap(); + assert_eq!(encoded, 0xfefe_feff); + assert_eq!(decode_32_bits_bar_size(encoded), Some(0x0101_0101)); + + // Similarly we encode a 64 bits size and then store it as a 2 32bit addresses (we use + // two BARs). 
+ let (hi, lo) = encode_64_bits_bar_size(0xffff_ffff_ffff_fff0).unwrap(); + assert_eq!(hi, 0); + assert_eq!(lo, 0x0000_0010); + assert_eq!(decode_64_bits_bar_size(hi, lo), Some(0xffff_ffff_ffff_fff0)); + } + + #[test] + fn test_add_pci_bar() { + let mut pci_config = PciConfiguration::new( + 0x42, + 0x0, + 0x0, + PciClassCode::MassStorage, + &PciMassStorageSubclass::SerialScsiController, + None, + PciHeaderType::Device, + 0x13, + 0x12, + None, + None, + ); + + // BAR size can only be a power of 2 + assert!(matches!( + pci_config.add_pci_bar(&PciBarConfiguration { + addr: 0x1000, + size: 0x1001, + idx: 0, + region_type: PciBarRegionType::Memory32BitRegion, + prefetchable: PciBarPrefetchable::Prefetchable, + }), + Err(Error::BarSizeInvalid(0x1001)) + )); + + // Invalid BAR index + assert!(matches!( + pci_config.add_pci_bar(&PciBarConfiguration { + addr: 0x1000, + size: 0x1000, + idx: NUM_BAR_REGS, + region_type: PciBarRegionType::Memory32BitRegion, + prefetchable: PciBarPrefetchable::Prefetchable + }), + Err(Error::BarInvalid(NUM_BAR_REGS)) + )); + // 64bit BARs need 2 BAR slots actually + assert!(matches!( + pci_config.add_pci_bar(&PciBarConfiguration { + addr: 0x1000, + size: 0x1000, + idx: NUM_BAR_REGS - 1, + region_type: PciBarRegionType::Memory64BitRegion, + prefetchable: PciBarPrefetchable::Prefetchable + }), + Err(Error::BarInvalid64(_)) + )); + + // Check for valid addresses + // Can't have an address that exceeds 32 bits for a 32bit BAR + assert!(matches!( + pci_config.add_pci_bar(&PciBarConfiguration { + addr: 0x1000_0000_0000_0000, + size: 0x1000, + idx: 0, + region_type: PciBarRegionType::Memory32BitRegion, + prefetchable: PciBarPrefetchable::Prefetchable + }), + Err(Error::BarAddressInvalid(0x1000_0000_0000_0000, 0x1000)) + )); + // Ensure that we handle properly overflows in 64bit BAR ranges + assert!(matches!( + pci_config.add_pci_bar(&PciBarConfiguration { + addr: u64::MAX, + size: 0x2, + idx: 0, + region_type: PciBarRegionType::Memory64BitRegion, + prefetchable: PciBarPrefetchable::Prefetchable + }), + Err(Error::BarAddressInvalid(u64::MAX, 2)) + )); + + // We can't reuse a BAR slot + pci_config + .add_pci_bar(&PciBarConfiguration { + addr: 0x1000, + size: 0x1000, + idx: 0, + region_type: PciBarRegionType::Memory32BitRegion, + prefetchable: PciBarPrefetchable::Prefetchable, + }) + .unwrap(); + assert!(matches!( + pci_config.add_pci_bar(&PciBarConfiguration { + addr: 0x1000, + size: 0x1000, + idx: 0, + region_type: PciBarRegionType::Memory32BitRegion, + prefetchable: PciBarPrefetchable::Prefetchable, + }), + Err(Error::BarInUse(0)) + )); + pci_config + .add_pci_bar(&PciBarConfiguration { + addr: 0x0000_0001_0000_0000, + size: 0x2000, + idx: 2, + region_type: PciBarRegionType::Memory64BitRegion, + prefetchable: PciBarPrefetchable::Prefetchable, + }) + .unwrap(); + // For 64bit BARs two BARs are used (in this case BARs 1 and 2) + assert!(matches!( + pci_config.add_pci_bar(&PciBarConfiguration { + addr: 0x0000_0001_0000_0000, + size: 0x1000, + idx: 2, + region_type: PciBarRegionType::Memory64BitRegion, + prefetchable: PciBarPrefetchable::Prefetchable, + }), + Err(Error::BarInUse(2)) + )); + assert!(matches!( + pci_config.add_pci_bar(&PciBarConfiguration { + addr: 0x0000_0001_0000_0000, + size: 0x1000, + idx: 1, + region_type: PciBarRegionType::Memory64BitRegion, + prefetchable: PciBarPrefetchable::Prefetchable, + }), + Err(Error::BarInUse64(2)) + )); + + assert_eq!(pci_config.get_bar_addr(0), 0x1000); + assert_eq!(pci_config.get_bar_addr(2), 0x1_0000_0000); + } } From 
51a1aa8cb5a1ee13183746d9dce94e3c0f6af3b0 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 5 Aug 2025 13:40:45 +0200 Subject: [PATCH 95/99] pci: add unit test for configuring MSI-X capability Make sure that configuring the MSI-X capability in PCI configuration space works properly (configuration space is initialized as expected). Also, make sure that the implementation respects the read/write properties of the respective bits. Finally, fix logic in the code that adds capabilities to take into account properly the size of capabilities. Signed-off-by: Babis Chalios --- src/pci/src/configuration.rs | 157 ++++++++++++++++++++++++++++++++--- 1 file changed, 144 insertions(+), 13 deletions(-) diff --git a/src/pci/src/configuration.rs b/src/pci/src/configuration.rs index 03f63f2c45f..1f7c2e2ac2e 100644 --- a/src/pci/src/configuration.rs +++ b/src/pci/src/configuration.rs @@ -190,7 +190,7 @@ pub trait PciProgrammingInterface { } /// Types of PCI capabilities. -#[derive(PartialEq, Eq, Copy, Clone)] +#[derive(Debug, PartialEq, Eq, Copy, Clone)] #[allow(dead_code)] #[allow(non_camel_case_types)] #[repr(u8)] @@ -474,8 +474,6 @@ pub enum Error { BarInvalid(usize), BarInvalid64(usize), BarSizeInvalid(u64), - CapabilityEmpty, - CapabilityLengthInvalid(usize), CapabilitySpaceFull(usize), Decode32BarSize, Decode64BarSize, @@ -505,8 +503,6 @@ impl Display for Error { NUM_BAR_REGS - 1 ), BarSizeInvalid(s) => write!(f, "bar address {s} not a power of two"), - CapabilityEmpty => write!(f, "empty capabilities are invalid"), - CapabilityLengthInvalid(l) => write!(f, "Invalid capability length {l}"), CapabilitySpaceFull(s) => write!(f, "capability of size {s} doesn't fit"), Decode32BarSize => write!(f, "failed to decode 32 bits BAR size"), Decode64BarSize => write!(f, "failed to decode 64 bits BAR size"), @@ -789,15 +785,12 @@ impl PciConfiguration { } /// Adds the capability `cap_data` to the list of capabilities. - /// `cap_data` should include the two-byte PCI capability header (type, next), - /// but not populate it. Correct values will be generated automatically based - /// on `cap_data.id()`. + /// + /// `cap_data` should not include the two-byte PCI capability header (type, next). + /// Correct values will be generated automatically based on `cap_data.id()` and + /// `cap_data.len()`. pub fn add_capability(&mut self, cap_data: &dyn PciCapability) -> Result { - let total_len = cap_data.bytes().len(); - // Check that the length is valid. 
- if cap_data.bytes().is_empty() { - return Err(Error::CapabilityEmpty); - } + let total_len = cap_data.bytes().len() + 2; let (cap_offset, tail_offset) = match self.last_capability { Some((offset, len)) => (Self::next_dword(offset, len), offset + 1), None => (FIRST_CAPABILITY_OFFSET, CAPABILITY_LIST_HEAD_OFFSET), @@ -1006,6 +999,7 @@ mod tests { use vm_memory::ByteValued; use super::*; + use crate::MsixCap; #[repr(C, packed)] #[derive(Clone, Copy, Default)] @@ -1028,6 +1022,28 @@ mod tests { } } + struct BadCap { + data: Vec, + } + + impl BadCap { + fn new(len: u8) -> Self { + Self { + data: (0..len).collect(), + } + } + } + + impl PciCapability for BadCap { + fn bytes(&self) -> &[u8] { + &self.data + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } + } + #[test] fn add_capability() { let mut cfg = PciConfiguration::new( @@ -1044,6 +1060,20 @@ mod tests { None, ); + // Bad size capabilities + assert!(matches!( + cfg.add_capability(&BadCap::new(127)), + Err(Error::CapabilitySpaceFull(129)) + )); + cfg.add_capability(&BadCap::new(62)).unwrap(); + cfg.add_capability(&BadCap::new(62)).unwrap(); + assert!(matches!( + cfg.add_capability(&BadCap::new(0)), + Err(Error::CapabilitySpaceFull(2)) + )); + // Reset capabilities + cfg.last_capability = None; + // Add two capabilities with different contents. let cap1 = TestCap { len: 4, foo: 0xAA }; let cap1_offset = cfg.add_capability(&cap1).unwrap(); @@ -1074,6 +1104,107 @@ mod tests { assert_eq!((cap2_data >> 24) & 0xFF, 0x55); // cap2.foo } + #[test] + fn test_msix_capability() { + let mut cfg = PciConfiguration::new( + 0x1234, + 0x5678, + 0x1, + PciClassCode::MultimediaController, + &PciMultimediaSubclass::AudioController, + None, + PciHeaderType::Device, + 0xABCD, + 0x2468, + None, + None, + ); + + // Information about the MSI-X capability layout: https://wiki.osdev.org/PCI#Enabling_MSI-X + let msix_cap = MsixCap::new( + 3, // Using BAR3 for message control table + 1024, // 1024 MSI-X vectors + 0x4000, // Offset of message control table inside the BAR + 4, // BAR4 used for pending control bit + 0x420, // Offset of pending bit array (PBA) inside BAR + ); + cfg.add_capability(&msix_cap).unwrap(); + + let cap_reg = FIRST_CAPABILITY_OFFSET / 4; + let reg = cfg.read_reg(cap_reg); + // Capability ID is MSI-X + assert_eq!( + PciCapabilityId::from((reg & 0xff) as u8), + PciCapabilityId::MsiX + ); + // We only have one capability, so `next` should be 0 + assert_eq!(((reg >> 8) & 0xff) as u8, 0); + let msg_ctl = (reg >> 16) as u16; + + // MSI-X is enabled + assert_eq!(msg_ctl & 0x8000, 0x8000); + // Vectors are not masked + assert_eq!(msg_ctl & 0x4000, 0x0); + // Reserved bits are 0 + assert_eq!(msg_ctl & 0x3800, 0x0); + // We've got 1024 vectors (Table size is N-1 encoded) + assert_eq!((msg_ctl & 0x7ff) + 1, 1024); + + let reg = cfg.read_reg(cap_reg + 1); + // We are using BAR3 + assert_eq!(reg & 0x7, 3); + // Message Control Table is located in offset 0x4000 inside the BAR + // We don't need to shift. Offset needs to be 8-byte aligned - so BIR + // is stored in its last 3 bits (which we need to mask out). 
+ assert_eq!(reg & 0xffff_fff8, 0x4000); + + let reg = cfg.read_reg(cap_reg + 2); + // PBA is 0x420 bytes inside BAR4 + assert_eq!(reg & 0x7, 4); + assert_eq!(reg & 0xffff_fff8, 0x420); + + // Check read/write mask + // Capability Id of MSI-X is 0x11 + cfg.write_config_register(cap_reg, 0, &[0x0]); + assert_eq!( + PciCapabilityId::from((cfg.read_reg(cap_reg) & 0xff) as u8), + PciCapabilityId::MsiX + ); + // Cannot override next capability pointer + cfg.write_config_register(cap_reg, 1, &[0x42]); + assert_eq!((cfg.read_reg(cap_reg) >> 8) & 0xff, 0); + + // We are writing this: + // + // meaning: | MSI enabled | Vectors Masked | Reserved | Table size | + // bit: | 15 | 14 | 13 - 11 | 0 - 10 | + // R/W: | R/W | R/W | R | R | + let msg_ctl = (cfg.read_reg(cap_reg) >> 16) as u16; + // Try to flip all bits + cfg.write_config_register(cap_reg, 2, &u16::to_le_bytes(!msg_ctl)); + let msg_ctl = (cfg.read_reg(cap_reg) >> 16) as u16; + // MSI enabled and Vectors masked should be flipped (MSI disabled and vectors masked) + assert_eq!(msg_ctl & 0xc000, 0x4000); + // Reserved bits should still be 0 + assert_eq!(msg_ctl & 0x3800, 0); + // Table size should not have changed + assert_eq!((msg_ctl & 0x07ff) + 1, 1024); + + // Table offset is read only + let table_offset = cfg.read_reg(cap_reg + 1); + // Try to flip all bits + cfg.write_config_register(cap_reg + 1, 0, &u32::to_le_bytes(!table_offset)); + // None should be flipped + assert_eq!(cfg.read_reg(cap_reg + 1), table_offset); + + // PBA offset also + let pba_offset = cfg.read_reg(cap_reg + 2); + // Try to flip all bits + cfg.write_config_register(cap_reg + 2, 0, &u32::to_le_bytes(!pba_offset)); + // None should be flipped + assert_eq!(cfg.read_reg(cap_reg + 2), pba_offset); + } + #[derive(Copy, Clone)] enum TestPi { Test = 0x5a, From c9bdb879d77006ad8190b96cb47617e911978f2e Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 5 Aug 2025 16:06:06 +0200 Subject: [PATCH 96/99] pci: add unit test for accesses to invalid registers Make sure we handle correctly accessing invalid registers. 
Signed-off-by: Babis Chalios --- src/pci/src/configuration.rs | 50 ++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/src/pci/src/configuration.rs b/src/pci/src/configuration.rs index 1f7c2e2ac2e..13fae13a9e4 100644 --- a/src/pci/src/configuration.rs +++ b/src/pci/src/configuration.rs @@ -831,6 +831,10 @@ impl PciConfiguration { } pub fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) { + if reg_idx >= NUM_CONFIGURATION_REGISTERS { + return; + } + if offset as usize + data.len() > 4 { return; } @@ -1401,4 +1405,50 @@ mod tests { assert_eq!(pci_config.get_bar_addr(0), 0x1000); assert_eq!(pci_config.get_bar_addr(2), 0x1_0000_0000); } + + #[test] + fn test_access_invalid_reg() { + let mut pci_config = PciConfiguration::new( + 0x42, + 0x0, + 0x0, + PciClassCode::MassStorage, + &PciMassStorageSubclass::SerialScsiController, + None, + PciHeaderType::Device, + 0x13, + 0x12, + None, + None, + ); + + // Can't read past the end of the configuration space + assert_eq!( + pci_config.read_reg(NUM_CONFIGURATION_REGISTERS), + 0xffff_ffff + ); + + // Read out all of configuration space + let config_space: Vec = (0..NUM_CONFIGURATION_REGISTERS) + .map(|reg_idx| pci_config.read_reg(reg_idx)) + .collect(); + + // Various invalid write accesses + + // Past the end of config space + pci_config.write_config_register(NUM_CONFIGURATION_REGISTERS, 0, &[0x42]); + pci_config.write_config_register(NUM_CONFIGURATION_REGISTERS, 0, &[0x42, 0x42]); + pci_config.write_config_register(NUM_CONFIGURATION_REGISTERS, 0, &[0x42, 0x42, 0x42, 0x42]); + + // Past register boundaries + pci_config.write_config_register(NUM_CONFIGURATION_REGISTERS, 1, &[0x42, 0x42, 0x42, 0x42]); + pci_config.write_config_register(NUM_CONFIGURATION_REGISTERS, 2, &[0x42, 0x42, 0x42]); + pci_config.write_config_register(NUM_CONFIGURATION_REGISTERS, 3, &[0x42, 0x42]); + pci_config.write_config_register(NUM_CONFIGURATION_REGISTERS, 4, &[0x42]); + pci_config.write_config_register(NUM_CONFIGURATION_REGISTERS, 5, &[]); + + for (reg_idx, reg) in config_space.iter().enumerate() { + assert_eq!(*reg, pci_config.read_reg(reg_idx)); + } + } } From 0f396a23bfe434dba9e27653e4f66643fb2df289 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 5 Aug 2025 17:12:17 +0200 Subject: [PATCH 97/99] pci: add unit test for BAR reprogramming detection Make sure we detect correctly valid intents to reprogram (move) BARs. Also, make sure we correctly ignore buggy ones. 
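A worked example of the 64-bit case the test exercises: the guest moves a BAR from
0x13_1200_0000 to 0x84_4200_0000 by writing the two dwords separately, and only the second
write is reported as a reprogramming event:

    // write low dword  0x4200_0000 -> no event yet, the BAR is only half updated
    // write high dword 0x84        -> detect_bar_reprogramming() reports
    //                                 old_base = 0x13_1200_0000,
    //                                 new_base = 0x84_4200_0000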
Signed-off-by: Babis Chalios --- src/pci/src/configuration.rs | 142 ++++++++++++++++++++++++++++++++++- src/pci/src/device.rs | 2 +- 2 files changed, 140 insertions(+), 4 deletions(-) diff --git a/src/pci/src/configuration.rs b/src/pci/src/configuration.rs index 13fae13a9e4..8ac78e191a7 100644 --- a/src/pci/src/configuration.rs +++ b/src/pci/src/configuration.rs @@ -924,9 +924,15 @@ impl PciConfiguration { region_type, }); } else if (reg_idx > BAR0_REG) - && ((self.registers[reg_idx - 1] & self.writable_bits[reg_idx - 1]) - != (self.bars[bar_idx - 1].addr & self.writable_bits[reg_idx - 1]) - || (value & mask) != (self.bars[bar_idx].addr & mask)) + && ( + // The lower BAR (of this 64bit BAR) has been reprogrammed to a different value + // than it used to be + (self.registers[reg_idx - 1] & self.writable_bits[reg_idx - 1]) + != (self.bars[bar_idx - 1].addr & self.writable_bits[reg_idx - 1]) || + // Or the lower BAR hasn't been changed but the upper one is being reprogrammed + // now to a different value + (value & mask) != (self.bars[bar_idx].addr & mask) + ) { info!( "Detected BAR reprogramming: (BAR {}) 0x{:x}->0x{:x}", @@ -1451,4 +1457,134 @@ mod tests { assert_eq!(*reg, pci_config.read_reg(reg_idx)); } } + + #[test] + fn test_detect_bar_reprogramming() { + let mut pci_config = PciConfiguration::new( + 0x42, + 0x0, + 0x0, + PciClassCode::MassStorage, + &PciMassStorageSubclass::SerialScsiController, + None, + PciHeaderType::Device, + 0x13, + 0x12, + None, + None, + ); + + // Trying to reprogram with something less than 4 bytes (length of the address) should fail + assert!(pci_config + .detect_bar_reprogramming(BAR0_REG, &[0x13]) + .is_none()); + assert!(pci_config + .detect_bar_reprogramming(BAR0_REG, &[0x13, 0x12]) + .is_none()); + assert!(pci_config + .detect_bar_reprogramming(BAR0_REG, &[0x13, 0x12]) + .is_none()); + assert!(pci_config + .detect_bar_reprogramming(BAR0_REG, &[0x13, 0x12, 0x16]) + .is_none()); + + // Writing all 1s is a special case where we're actually asking for the size of the BAR + assert!(pci_config + .detect_bar_reprogramming(BAR0_REG, &u32::to_le_bytes(0xffff_ffff)) + .is_none()); + + // Trying to reprogram a BAR that hasn't be initialized does nothing + for reg_idx in BAR0_REG..BAR0_REG + NUM_BAR_REGS { + assert!(pci_config + .detect_bar_reprogramming(reg_idx, &u32::to_le_bytes(0x1312_4243)) + .is_none()); + } + + // Reprogramming of a 32bit BAR + pci_config + .add_pci_bar(&PciBarConfiguration { + addr: 0x1000, + size: 0x1000, + idx: 0, + region_type: PciBarRegionType::Memory32BitRegion, + prefetchable: PciBarPrefetchable::Prefetchable, + }) + .unwrap(); + + assert_eq!( + pci_config.detect_bar_reprogramming(BAR0_REG, &u32::to_le_bytes(0x2000)), + Some(BarReprogrammingParams { + old_base: 0x1000, + new_base: 0x2000, + len: 0x1000, + region_type: PciBarRegionType::Memory32BitRegion + }) + ); + + pci_config.write_config_register(BAR0_REG, 0, &u32::to_le_bytes(0x2000)); + assert_eq!(pci_config.read_reg(BAR0_REG) & 0xffff_fff0, 0x2000); + + // Attempting to reprogram the BAR with the same address should not have any effect + assert!(pci_config + .detect_bar_reprogramming(BAR0_REG, &u32::to_le_bytes(0x2000)) + .is_none()); + + // Reprogramming of a 64bit BAR + pci_config + .add_pci_bar(&PciBarConfiguration { + addr: 0x13_1200_0000, + size: 0x8000, + idx: 1, + region_type: PciBarRegionType::Memory64BitRegion, + prefetchable: PciBarPrefetchable::Prefetchable, + }) + .unwrap(); + + assert_eq!(pci_config.read_reg(BAR0_REG + 1) & 0xffff_fff0, 0x1200_0000); + assert_eq!( + 
pci_config.bars[1].r#type, + Some(PciBarRegionType::Memory64BitRegion) + ); + assert_eq!(pci_config.read_reg(BAR0_REG + 2), 0x13); + assert!(pci_config.bars[2].r#type.is_none()); + + // First we write the lower 32 bits and this shouldn't cause any reprogramming + assert!(pci_config + .detect_bar_reprogramming(BAR0_REG + 1, &u32::to_le_bytes(0x4200_0000)) + .is_none()); + pci_config.write_config_register(BAR0_REG + 1, 0, &u32::to_le_bytes(0x4200_0000)); + + // Writing the upper 32 bits should trigger the reprogramming + assert_eq!( + pci_config.detect_bar_reprogramming(BAR0_REG + 2, &u32::to_le_bytes(0x84)), + Some(BarReprogrammingParams { + old_base: 0x13_1200_0000, + new_base: 0x84_4200_0000, + len: 0x8000, + region_type: PciBarRegionType::Memory64BitRegion + }) + ); + pci_config.write_config_register(BAR0_REG + 2, 0, &u32::to_le_bytes(0x84)); + + // Trying to reprogram the upper bits directly (without first touching the lower bits) + // should trigger a reprogramming + assert_eq!( + pci_config.detect_bar_reprogramming(BAR0_REG + 2, &u32::to_le_bytes(0x1312)), + Some(BarReprogrammingParams { + old_base: 0x84_4200_0000, + new_base: 0x1312_4200_0000, + len: 0x8000, + region_type: PciBarRegionType::Memory64BitRegion + }) + ); + pci_config.write_config_register(BAR0_REG + 2, 0, &u32::to_le_bytes(0x1312)); + + // Attempting to reprogram the BAR with the same address should not have any effect + assert!(pci_config + .detect_bar_reprogramming(BAR0_REG + 1, &u32::to_le_bytes(0x4200_0000)) + .is_none()); + assert!(pci_config + .detect_bar_reprogramming(BAR0_REG + 2, &u32::to_le_bytes(0x1312)) + .is_none()); + } } diff --git a/src/pci/src/device.rs b/src/pci/src/device.rs index ba84e6ec6bf..11db4f478a5 100644 --- a/src/pci/src/device.rs +++ b/src/pci/src/device.rs @@ -25,7 +25,7 @@ pub enum Error { } pub type Result = std::result::Result; -#[derive(Clone, Copy)] +#[derive(Clone, Copy, Debug, PartialEq, Eq)] pub struct BarReprogrammingParams { pub old_base: u64, pub new_base: u64, From dce075420ad3cbf3939545d296087be0a61e2cf4 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 6 Aug 2025 14:35:49 +0200 Subject: [PATCH 98/99] pci: add unit tests for PCI bus accesses Add a few unit tests to check the logic that accesses PCI configuration space via the PCI Bus. Ensure that negative cases are being handled properly. Signed-off-by: Babis Chalios --- src/pci/src/bus.rs | 462 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 441 insertions(+), 21 deletions(-) diff --git a/src/pci/src/bus.rs b/src/pci/src/bus.rs index 4f19360c097..01c9b1f1933 100644 --- a/src/pci/src/bus.rs +++ b/src/pci/src/bus.rs @@ -182,7 +182,7 @@ impl PciConfigIo { return None; } - let (bus, device, _function, register) = + let (bus, device, function, register) = parse_io_config_address(self.config_address & !0x8000_0000); // Only support one bus. @@ -190,6 +190,11 @@ impl PciConfigIo { return None; } + // Don't support multi-function devices. + if function > 0 { + return None; + } + let pci_bus = self.pci_bus.as_ref().lock().unwrap(); if let Some(d) = pci_bus.devices.get(&(device as u32)) { let mut device = d.lock().unwrap(); @@ -240,6 +245,16 @@ impl PciConfigIo { impl BusDevice for PciConfigIo { fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + // Only allow reads to the register boundary. 
+ let start = offset as usize % 4; + let end = start + data.len(); + if end > 4 { + for d in data.iter_mut() { + *d = 0xff; + } + return; + } + // `offset` is relative to 0xcf8 let value = match offset { 0..=3 => self.config_address, @@ -247,17 +262,8 @@ impl BusDevice for PciConfigIo { _ => 0xffff_ffff, }; - // Only allow reads to the register boundary. - let start = offset as usize % 4; - let end = start + data.len(); - if end <= 4 { - for i in start..end { - data[i - start] = (value >> (i * 8)) as u8; - } - } else { - for d in data { - *d = 0xff; - } + for i in start..end { + data[i - start] = (value >> (i * 8)) as u8; } } @@ -285,13 +291,18 @@ impl PciConfigMmio { } fn config_space_read(&self, config_address: u32) -> u32 { - let (bus, device, _function, register) = parse_mmio_config_address(config_address); + let (bus, device, function, register) = parse_mmio_config_address(config_address); // Only support one bus. if bus != 0 { return 0xffff_ffff; } + // Don't support multi-function devices. + if function > 0 { + return 0xffff_ffff; + } + self.pci_bus .lock() .unwrap() @@ -307,13 +318,18 @@ impl PciConfigMmio { return; } - let (bus, device, _function, register) = parse_mmio_config_address(config_address); + let (bus, device, function, register) = parse_mmio_config_address(config_address); // Only support one bus. if bus != 0 { return; } + // Don't support multi-function devices. + if function > 0 { + return; + } + let pci_bus = self.pci_bus.lock().unwrap(); if let Some(d) = pci_bus.devices.get(&(device as u32)) { let mut device = d.lock().unwrap(); @@ -415,14 +431,28 @@ fn parse_io_config_address(config_address: u32) -> (usize, usize, usize, usize) #[cfg(test)] mod tests { + use std::sync::atomic::AtomicUsize; use std::sync::{Arc, Mutex}; use vm_device::BusDevice; - use super::{PciBus, PciConfigIo, PciRoot}; - use crate::DeviceRelocation; + use super::{PciBus, PciConfigIo, PciConfigMmio, PciRoot}; + use crate::bus::{DEVICE_ID_INTEL_VIRT_PCIE_HOST, VENDOR_ID_INTEL}; + use crate::{ + DeviceRelocation, PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciClassCode, + PciConfiguration, PciDevice, PciHeaderType, PciMassStorageSubclass, + }; + + #[derive(Debug, Default)] + struct RelocationMock { + reloc_cnt: AtomicUsize, + } - struct RelocationMock; + impl RelocationMock { + fn cnt(&self) -> usize { + self.reloc_cnt.load(std::sync::atomic::Ordering::SeqCst) + } + } impl DeviceRelocation for RelocationMock { fn move_bar( @@ -433,13 +463,71 @@ mod tests { _pci_dev: &mut dyn crate::PciDevice, _region_type: crate::PciBarRegionType, ) -> std::result::Result<(), std::io::Error> { + self.reloc_cnt + .fetch_add(1, std::sync::atomic::Ordering::SeqCst); Ok(()) } } + struct PciDevMock(PciConfiguration); + + impl PciDevMock { + fn new() -> Self { + let mut config = PciConfiguration::new( + 0x42, + 0x0, + 0x0, + PciClassCode::MassStorage, + &PciMassStorageSubclass::SerialScsiController, + None, + PciHeaderType::Device, + 0x13, + 0x12, + None, + None, + ); + + config + .add_pci_bar(&PciBarConfiguration { + addr: 0x1000, + size: 0x1000, + idx: 0, + region_type: PciBarRegionType::Memory32BitRegion, + prefetchable: PciBarPrefetchable::Prefetchable, + }) + .unwrap(); + + PciDevMock(config) + } + } + + impl PciDevice for PciDevMock { + fn write_config_register( + &mut self, + reg_idx: usize, + offset: u64, + data: &[u8], + ) -> Option> { + self.0.write_config_register(reg_idx, offset, data); + None + } + + fn read_config_register(&mut self, reg_idx: usize) -> u32 { + self.0.read_reg(reg_idx) + } + + 
fn detect_bar_reprogramming( + &mut self, + reg_idx: usize, + data: &[u8], + ) -> Option { + self.0.detect_bar_reprogramming(reg_idx, data) + } + } + #[test] - fn test_writing_config_address() { - let mock = Arc::new(RelocationMock); + fn test_writing_io_config_address() { + let mock = Arc::new(RelocationMock::default()); let root = PciRoot::new(None); let mut bus = PciConfigIo::new(Arc::new(Mutex::new(PciBus::new(root, mock)))); @@ -480,8 +568,8 @@ mod tests { } #[test] - fn test_reading_config_address() { - let mock = Arc::new(RelocationMock); + fn test_reading_io_config_address() { + let mock = Arc::new(RelocationMock::default()); let root = PciRoot::new(None); let mut bus = PciConfigIo::new(Arc::new(Mutex::new(PciBus::new(root, mock)))); @@ -527,4 +615,336 @@ mod tests { bus.read(0, 0, &mut buffer); assert_eq!(buffer, [0x45, 0x44, 0x43, 0x42]); } + + fn initialize_bus() -> (PciConfigMmio, PciConfigIo, Arc) { + let mock = Arc::new(RelocationMock::default()); + let root = PciRoot::new(None); + let mut bus = PciBus::new(root, mock.clone()); + bus.add_device(1, Arc::new(Mutex::new(PciDevMock::new()))) + .unwrap(); + let bus = Arc::new(Mutex::new(bus)); + (PciConfigMmio::new(bus.clone()), PciConfigIo::new(bus), mock) + } + + #[test] + fn test_invalid_register_boundary_reads() { + let (mut mmio_config, mut io_config, _) = initialize_bus(); + + // Read crossing register boundaries + let mut buffer = [0u8; 4]; + mmio_config.read(0, 1, &mut buffer); + assert_eq!(0xffff_ffff, u32::from_le_bytes(buffer)); + + let mut buffer = [0u8; 4]; + io_config.read(0, 1, &mut buffer); + assert_eq!(0xffff_ffff, u32::from_le_bytes(buffer)); + + // As well in the config space + let mut buffer = [0u8; 4]; + io_config.read(0, 5, &mut buffer); + assert_eq!(0xffff_ffff, u32::from_le_bytes(buffer)); + } + + // MMIO config addresses are of the form + // + // | Base address upper bits | Bus Number | Device Number | Function Number | Register number | Byte offset | + // | 31-28 | 27-20 | 19-15 | 14-12 | 11-2 | 0-1 | + // + // Meaning that the offset is built using: + // + // `bus << 20 | device << 15 | function << 12 | register << 2 | byte` + fn mmio_offset(bus: u8, device: u8, function: u8, register: u16, byte: u8) -> u32 { + assert!(device < 32); + assert!(function < 8); + assert!(register < 1024); + assert!(byte < 4); + + (bus as u32) << 20 + | (device as u32) << 15 + | (function as u32) << 12 + | (register as u32) << 2 + | (byte as u32) + } + + fn read_mmio_config( + config: &mut PciConfigMmio, + bus: u8, + device: u8, + function: u8, + register: u16, + byte: u8, + data: &mut [u8], + ) { + config.read( + 0, + mmio_offset(bus, device, function, register, byte) as u64, + data, + ); + } + + fn write_mmio_config( + config: &mut PciConfigMmio, + bus: u8, + device: u8, + function: u8, + register: u16, + byte: u8, + data: &[u8], + ) { + config.write( + 0, + mmio_offset(bus, device, function, register, byte) as u64, + data, + ); + } + + // Similarly, when using the IO mechanism the config addresses have the following format + // + // | Enabled | zeros | Bus Number | Device Number | Function Number | Register number | zeros | + // | 31 | 30-24 | 23-16 | 15-11 | 10-8 | 7-2 | 1-0 | + // + // + // Meaning that the address is built using: + // + // 0x8000_0000 | bus << 16 | device << 11 | function << 8 | register << 2; + // + // Only 32-bit aligned accesses are allowed here. 
+ fn pio_offset(enabled: bool, bus: u8, device: u8, function: u8, register: u8) -> u32 { + assert!(device < 32); + assert!(function < 8); + assert!(register < 64); + + let offset = if enabled { 0x8000_0000 } else { 0u32 }; + + offset + | (bus as u32) << 16 + | (device as u32) << 11 + | (function as u32) << 8 + | (register as u32) << 2 + } + + fn set_io_address( + config: &mut PciConfigIo, + enabled: bool, + bus: u8, + device: u8, + function: u8, + register: u8, + ) { + let address = u32::to_le_bytes(pio_offset(enabled, bus, device, function, register)); + config.write(0, 0, &address); + } + + fn read_io_config( + config: &mut PciConfigIo, + enabled: bool, + bus: u8, + device: u8, + function: u8, + register: u8, + data: &mut [u8], + ) { + set_io_address(config, enabled, bus, device, function, register); + config.read(0, 4, data); + } + + fn write_io_config( + config: &mut PciConfigIo, + enabled: bool, + bus: u8, + device: u8, + function: u8, + register: u8, + data: &[u8], + ) { + set_io_address(config, enabled, bus, device, function, register); + config.write(0, 4, data); + } + + #[test] + fn test_mmio_invalid_bus_number() { + let (mut mmio_config, _, _) = initialize_bus(); + let mut buffer = [0u8; 4]; + + // Asking for Bus 1 should return all 1s + read_mmio_config(&mut mmio_config, 1, 0, 0, 0, 0, &mut buffer); + assert_eq!(buffer, u32::to_le_bytes(0xffff_ffff)); + // Writing the same + buffer[0] = 0x42; + write_mmio_config(&mut mmio_config, 1, 0, 0, 15, 0, &buffer); + read_mmio_config(&mut mmio_config, 1, 0, 0, 15, 0, &mut buffer); + assert_eq!(buffer, u32::to_le_bytes(0xffff_ffff)); + read_mmio_config(&mut mmio_config, 0, 0, 0, 15, 0, &mut buffer); + assert_eq!(buffer, u32::to_le_bytes(0x0)); + + // Asking for Bus 0 should work + read_mmio_config(&mut mmio_config, 0, 0, 0, 0, 0, &mut buffer); + assert_eq!(&buffer[..2], &u16::to_le_bytes(VENDOR_ID_INTEL)); + assert_eq!( + &buffer[2..], + &u16::to_le_bytes(DEVICE_ID_INTEL_VIRT_PCIE_HOST) + ); + } + + #[test] + fn test_io_invalid_bus_number() { + let (_, mut pio_config, _) = initialize_bus(); + let mut buffer = [0u8; 4]; + + // Asking for Bus 1 should return all 1s + read_io_config(&mut pio_config, true, 1, 0, 0, 0, &mut buffer); + assert_eq!(buffer, u32::to_le_bytes(0xffff_ffff)); + + // Asking for Bus 0 should work + read_io_config(&mut pio_config, true, 0, 0, 0, 0, &mut buffer); + assert_eq!(&buffer[..2], &u16::to_le_bytes(VENDOR_ID_INTEL)); + assert_eq!( + &buffer[2..], + &u16::to_le_bytes(DEVICE_ID_INTEL_VIRT_PCIE_HOST) + ); + } + + #[test] + fn test_mmio_invalid_function() { + let (mut mmio_config, _, _) = initialize_bus(); + let mut buffer = [0u8; 4]; + + // Asking for Bus 1 should return all 1s + read_mmio_config(&mut mmio_config, 0, 0, 1, 0, 0, &mut buffer); + assert_eq!(buffer, u32::to_le_bytes(0xffff_ffff)); + // Writing the same + buffer[0] = 0x42; + write_mmio_config(&mut mmio_config, 0, 0, 1, 15, 0, &buffer); + read_mmio_config(&mut mmio_config, 0, 0, 1, 15, 0, &mut buffer); + assert_eq!(buffer, u32::to_le_bytes(0xffff_ffff)); + read_mmio_config(&mut mmio_config, 0, 0, 0, 15, 0, &mut buffer); + assert_eq!(buffer, u32::to_le_bytes(0x0)); + + // Asking for Bus 0 should work + read_mmio_config(&mut mmio_config, 0, 0, 0, 0, 0, &mut buffer); + assert_eq!(&buffer[..2], &u16::to_le_bytes(VENDOR_ID_INTEL)); + assert_eq!( + &buffer[2..], + &u16::to_le_bytes(DEVICE_ID_INTEL_VIRT_PCIE_HOST) + ); + } + + #[test] + fn test_io_invalid_function() { + let (_, mut pio_config, _) = initialize_bus(); + let mut buffer = [0u8; 4]; + + // Asking for 
Bus 1 should return all 1s + read_io_config(&mut pio_config, true, 0, 0, 1, 0, &mut buffer); + assert_eq!(buffer, u32::to_le_bytes(0xffff_ffff)); + + // Asking for Bus 0 should work + read_io_config(&mut pio_config, true, 0, 0, 0, 0, &mut buffer); + assert_eq!(&buffer[..2], &u16::to_le_bytes(VENDOR_ID_INTEL)); + assert_eq!( + &buffer[2..], + &u16::to_le_bytes(DEVICE_ID_INTEL_VIRT_PCIE_HOST) + ); + } + + #[test] + fn test_io_disabled_reads() { + let (_, mut pio_config, _) = initialize_bus(); + let mut buffer = [0u8; 4]; + + // Trying to read without enabling should return all 1s + read_io_config(&mut pio_config, false, 0, 0, 0, 0, &mut buffer); + assert_eq!(buffer, u32::to_le_bytes(0xffff_ffff)); + + // Asking for Bus 0 should work + read_io_config(&mut pio_config, true, 0, 0, 0, 0, &mut buffer); + assert_eq!(&buffer[..2], &u16::to_le_bytes(VENDOR_ID_INTEL)); + assert_eq!( + &buffer[2..], + &u16::to_le_bytes(DEVICE_ID_INTEL_VIRT_PCIE_HOST) + ); + } + + #[test] + fn test_io_disabled_writes() { + let (_, mut pio_config, _) = initialize_bus(); + + // Try to write the IRQ line used for the root port. + let mut buffer = [0u8; 4]; + + // First read the current value (use `enabled` bit) + read_io_config(&mut pio_config, true, 0, 0, 0, 15, &mut buffer); + let irq_line = buffer[0]; + + // Write without setting the `enabled` bit. + buffer[0] = 0x42; + write_io_config(&mut pio_config, false, 0, 0, 0, 15, &buffer); + + // IRQ line shouldn't have changed + read_io_config(&mut pio_config, true, 0, 0, 0, 15, &mut buffer); + assert_eq!(buffer[0], irq_line); + + // Write with `enabled` bit set. + buffer[0] = 0x42; + write_io_config(&mut pio_config, true, 0, 0, 0, 15, &buffer); + + // IRQ line should change + read_io_config(&mut pio_config, true, 0, 0, 0, 15, &mut buffer); + assert_eq!(buffer[0], 0x42); + } + + #[test] + fn test_mmio_writes() { + let (mut mmio_config, _, _) = initialize_bus(); + let mut buffer = [0u8; 4]; + + read_mmio_config(&mut mmio_config, 0, 0, 0, 15, 0, &mut buffer); + assert_eq!(buffer[0], 0x0); + write_mmio_config(&mut mmio_config, 0, 0, 0, 15, 0, &[0x42]); + read_mmio_config(&mut mmio_config, 0, 0, 0, 15, 0, &mut buffer); + assert_eq!(buffer[0], 0x42); + } + + #[test] + fn test_bar_reprogramming() { + let (mut mmio_config, _, mock) = initialize_bus(); + let mut buffer = [0u8; 4]; + assert_eq!(mock.cnt(), 0); + + read_mmio_config(&mut mmio_config, 0, 1, 0, 0x4, 0, &mut buffer); + let old_addr = u32::from_le_bytes(buffer) & 0xffff_fff0; + assert_eq!(old_addr, 0x1000); + write_mmio_config( + &mut mmio_config, + 0, + 1, + 0, + 0x4, + 0, + &u32::to_le_bytes(0x1312_1110), + ); + + read_mmio_config(&mut mmio_config, 0, 1, 0, 0x4, 0, &mut buffer); + let new_addr = u32::from_le_bytes(buffer) & 0xffff_fff0; + assert_eq!(new_addr, 0x1312_1110); + assert_eq!(mock.cnt(), 1); + + // BAR1 should not be used, so reading its address should return all 0s + read_mmio_config(&mut mmio_config, 0, 1, 0, 0x5, 0, &mut buffer); + assert_eq!(buffer, [0x0, 0x0, 0x0, 0x0]); + + // and reprogramming shouldn't have any effect + write_mmio_config( + &mut mmio_config, + 0, + 1, + 0, + 0x5, + 0, + &u32::to_le_bytes(0x1312_1110), + ); + + read_mmio_config(&mut mmio_config, 0, 1, 0, 0x5, 0, &mut buffer); + assert_eq!(buffer, [0x0, 0x0, 0x0, 0x0]); + } } From 75eb51d5eebda6f371713fb9da1be443fc35c6a1 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Thu, 7 Aug 2025 16:02:22 +0200 Subject: [PATCH 99/99] pci: add unit tests for MSI-X code Also, drop some of effectively dead code that Cloud Hypervisor was using because 
they were not relying on KVM to handle interrupt controllers. Finally, fixup some error cases on guest reads which need to return all-ones when bad accesses happen. Signed-off-by: Babis Chalios --- Cargo.lock | 1 + src/pci/Cargo.toml | 1 + src/pci/src/configuration.rs | 3 + src/pci/src/lib.rs | 5 +- src/pci/src/msix.rs | 495 +++++++++++++++--- .../devices/virtio/transport/pci/device.rs | 2 +- 6 files changed, 417 insertions(+), 90 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 339ab721674..899cdb112b9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1053,6 +1053,7 @@ dependencies = [ "vm-allocator", "vm-device", "vm-memory", + "vmm-sys-util", ] [[package]] diff --git a/src/pci/Cargo.toml b/src/pci/Cargo.toml index a7ef102acfb..17dc30fcd6d 100644 --- a/src/pci/Cargo.toml +++ b/src/pci/Cargo.toml @@ -27,3 +27,4 @@ vm-memory = { version = "0.16.1", features = [ [dev-dependencies] serde_test = "1.0.177" +vmm-sys-util = "0.14.0" diff --git a/src/pci/src/configuration.rs b/src/pci/src/configuration.rs index 8ac78e191a7..fd1e3958ec8 100644 --- a/src/pci/src/configuration.rs +++ b/src/pci/src/configuration.rs @@ -843,11 +843,14 @@ impl PciConfiguration { if let Some(msix_cap_reg_idx) = self.msix_cap_reg_idx { if let Some(msix_config) = &self.msix_config { if msix_cap_reg_idx == reg_idx && offset == 2 && data.len() == 2 { + // 2-bytes write in the Message Control field msix_config .lock() .unwrap() .set_msg_ctl(LittleEndian::read_u16(data)); } else if msix_cap_reg_idx == reg_idx && offset == 0 && data.len() == 4 { + // 4 bytes write at the beginning. Ignore the first 2 bytes which are the + // capability id and next capability pointer msix_config .lock() .unwrap() diff --git a/src/pci/src/lib.rs b/src/pci/src/lib.rs index 1b9a3a99f76..83f5a7a5dcf 100644 --- a/src/pci/src/lib.rs +++ b/src/pci/src/lib.rs @@ -30,10 +30,7 @@ pub use self::configuration::{ pub use self::device::{ BarReprogrammingParams, DeviceRelocation, Error as PciDeviceError, PciDevice, }; -pub use self::msix::{ - Error as MsixError, MsixCap, MsixConfig, MsixConfigState, MsixTableEntry, MSIX_CONFIG_ID, - MSIX_TABLE_ENTRY_SIZE, -}; +pub use self::msix::{Error as MsixError, MsixCap, MsixConfig, MsixConfigState, MsixTableEntry}; /// PCI has four interrupt pins A->D. 
#[derive(Copy, Clone)] diff --git a/src/pci/src/msix.rs b/src/pci/src/msix.rs index 82f851322b4..50abeaf9737 100644 --- a/src/pci/src/msix.rs +++ b/src/pci/src/msix.rs @@ -21,10 +21,6 @@ const MSIX_PBA_ENTRIES_MODULO: u64 = 8; const BITS_PER_PBA_ENTRY: usize = 64; const FUNCTION_MASK_BIT: u8 = 14; const MSIX_ENABLE_BIT: u8 = 15; -const FUNCTION_MASK_MASK: u16 = (1 << FUNCTION_MASK_BIT) as u16; -const MSIX_ENABLE_MASK: u16 = (1 << MSIX_ENABLE_BIT) as u16; -pub const MSIX_TABLE_ENTRY_SIZE: usize = 16; -pub const MSIX_CONFIG_ID: &str = "msix_config"; #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum Error { @@ -72,8 +68,8 @@ pub struct MsixConfig { pub pba_entries: Vec, pub devid: u32, pub interrupt_source_group: Arc, - masked: bool, - enabled: bool, + pub masked: bool, + pub enabled: bool, } impl std::fmt::Debug for MsixConfig { @@ -136,7 +132,7 @@ impl MsixConfig { let mut table_entries: Vec = Vec::new(); table_entries.resize_with(msix_vectors as usize, Default::default); let mut pba_entries: Vec = Vec::new(); - let num_pba_entries: usize = ((msix_vectors as usize) / BITS_PER_PBA_ENTRY) + 1; + let num_pba_entries: usize = (msix_vectors as usize).div_ceil(BITS_PER_PBA_ENTRY); pba_entries.resize_with(num_pba_entries, Default::default); (table_entries, pba_entries, true, false) @@ -161,14 +157,6 @@ impl MsixConfig { } } - pub fn masked(&self) -> bool { - self.masked - } - - pub fn enabled(&self) -> bool { - self.enabled - } - pub fn set_msg_ctl(&mut self, reg: u16) { let old_masked = self.masked; let old_enabled = self.enabled; @@ -225,8 +213,8 @@ impl MsixConfig { let modulo_offset = offset % MSIX_TABLE_ENTRIES_MODULO; if index >= self.table_entries.len() { - debug!("Invalid MSI-X table entry index {index}"); - data.copy_from_slice(&[0xff; 8][..data.len()]); + warn!("Invalid MSI-X table entry index {index}"); + data.fill(0xff); return; } @@ -237,13 +225,12 @@ impl MsixConfig { 0x4 => self.table_entries[index].msg_addr_hi, 0x8 => self.table_entries[index].msg_data, 0xc => self.table_entries[index].vector_ctl, - _ => { - error!("invalid offset"); - 0 + off => { + warn!("msi-x: invalid offset in table entry read: {off}"); + 0xffff_ffff } }; - debug!("MSI_R TABLE offset 0x{:x} data 0x{:x}", offset, value); LittleEndian::write_u32(data, value); } 8 => { @@ -256,17 +243,17 @@ impl MsixConfig { (u64::from(self.table_entries[index].vector_ctl) << 32) | u64::from(self.table_entries[index].msg_data) } - _ => { - error!("invalid offset"); - 0 + off => { + warn!("msi-x: invalid offset in table entry read: {off}"); + 0xffff_ffff_ffff_ffff } }; - debug!("MSI_R TABLE offset 0x{:x} data 0x{:x}", offset, value); LittleEndian::write_u64(data, value); } - _ => { - error!("invalid data length"); + len => { + warn!("msi-x: invalid length in table entry read: {len}"); + data.fill(0xff); } } } @@ -278,7 +265,7 @@ impl MsixConfig { let modulo_offset = offset % MSIX_TABLE_ENTRIES_MODULO; if index >= self.table_entries.len() { - debug!("Invalid MSI-X table entry index {index}"); + warn!("msi-x: invalid table entry index {index}"); return; } @@ -295,10 +282,8 @@ impl MsixConfig { 0xc => { self.table_entries[index].vector_ctl = value; } - _ => error!("invalid offset"), + off => warn!("msi-x: invalid offset in table entry write: {off}"), }; - - debug!("MSI_W TABLE offset 0x{:x} data 0x{:x}", offset, value); } 8 => { let value = LittleEndian::read_u64(data); @@ -311,12 +296,10 @@ impl MsixConfig { self.table_entries[index].msg_data = (value & 0xffff_ffffu64) as u32; self.table_entries[index].vector_ctl = 
(value >> 32) as u32; } - _ => error!("invalid offset"), + off => warn!("msi-x: invalid offset in table entry write: {off}"), }; - - debug!("MSI_W TABLE offset 0x{:x} data 0x{:x}", offset, value); } - _ => error!("invalid data length"), + len => warn!("msi-x: invalid length in table entry write: {len}"), }; let table_entry = &self.table_entries[index]; @@ -329,7 +312,7 @@ impl MsixConfig { // Update interrupt routes // Optimisation: only update routes if the entry is not masked; // this is safe because if the entry is masked (starts masked as per spec) - // in the table then it won't be triggered. (See: #4273) + // in the table then it won't be triggered. if self.enabled && !self.masked && !table_entry.masked() { let config = MsiIrqSourceConfig { high_addr: table_entry.msg_addr_hi, @@ -357,8 +340,8 @@ impl MsixConfig { // device. // Check if bit has been flipped - if !self.masked() - && self.enabled() + if !self.masked + && self.enabled && old_entry.masked() && !table_entry.masked() && self.get_pba_bit(index as u16) == 1 @@ -367,15 +350,13 @@ impl MsixConfig { } } - pub fn read_pba(&mut self, offset: u64, data: &mut [u8]) { - assert!(data.len() <= 8); - + pub fn read_pba(&self, offset: u64, data: &mut [u8]) { let index: usize = (offset / MSIX_PBA_ENTRIES_MODULO) as usize; let modulo_offset = offset % MSIX_PBA_ENTRIES_MODULO; if index >= self.pba_entries.len() { - debug!("Invalid MSI-X PBA entry index {index}"); - data.copy_from_slice(&[0xff; 8][..data.len()]); + warn!("msi-x: invalid PBA entry index {index}"); + data.fill(0xff); return; } @@ -384,29 +365,28 @@ impl MsixConfig { let value: u32 = match modulo_offset { 0x0 => (self.pba_entries[index] & 0xffff_ffffu64) as u32, 0x4 => (self.pba_entries[index] >> 32) as u32, - _ => { - error!("invalid offset"); - 0 + off => { + warn!("msi-x: invalid offset in pba entry read: {off}"); + 0xffff_ffff } }; - debug!("MSI_R PBA offset 0x{:x} data 0x{:x}", offset, value); LittleEndian::write_u32(data, value); } 8 => { let value: u64 = match modulo_offset { 0x0 => self.pba_entries[index], - _ => { - error!("invalid offset"); - 0 + off => { + warn!("msi-x: invalid offset in pba entry read: {off}"); + 0xffff_ffff_ffff_ffff } }; - debug!("MSI_R PBA offset 0x{:x} data 0x{:x}", offset, value); LittleEndian::write_u64(data, value); } - _ => { - error!("invalid data length"); + len => { + warn!("msi-x: invalid length in table entry read: {len}"); + data.fill(0xff); } } } @@ -418,9 +398,13 @@ impl MsixConfig { pub fn set_pba_bit(&mut self, vector: u16, reset: bool) { assert!(vector < MAX_MSIX_VECTORS_PER_DEVICE); + if (vector as usize) >= self.table_entries.len() { + return; + } + let index: usize = (vector as usize) / BITS_PER_PBA_ENTRY; let shift: usize = (vector as usize) % BITS_PER_PBA_ENTRY; - let mut mask: u64 = (1 << shift) as u64; + let mut mask: u64 = 1u64 << shift; if reset { mask = !mask; @@ -433,6 +417,10 @@ impl MsixConfig { fn get_pba_bit(&self, vector: u16) -> u8 { assert!(vector < MAX_MSIX_VECTORS_PER_DEVICE); + if (vector as usize) >= self.table_entries.len() { + return 0xff; + } + let index: usize = (vector as usize) / BITS_PER_PBA_ENTRY; let shift: usize = (vector as usize) % BITS_PER_PBA_ENTRY; @@ -506,59 +494,396 @@ impl MsixCap { pba: (pba_off & 0xffff_fff8u32) | u32::from(pba_pci_bar & 0x7u8), } } +} + +#[cfg(test)] +mod tests { + use std::sync::atomic::{AtomicUsize, Ordering}; + + use vmm_sys_util::eventfd::EventFd; - pub fn set_msg_ctl(&mut self, data: u16) { - self.msg_ctl = (self.msg_ctl & !(FUNCTION_MASK_MASK | MSIX_ENABLE_MASK)) - | 
(data & (FUNCTION_MASK_MASK | MSIX_ENABLE_MASK)); + use super::*; + + #[derive(Debug)] + struct MockInterrupt { + trigger_cnt: [AtomicUsize; 2], + update_cnt: [AtomicUsize; 2], + event_fd: [EventFd; 2], } - pub fn masked(&self) -> bool { - (self.msg_ctl >> FUNCTION_MASK_BIT) & 0x1 == 0x1 + impl MockInterrupt { + fn new() -> Self { + MockInterrupt { + trigger_cnt: [AtomicUsize::new(0), AtomicUsize::new(0)], + update_cnt: [AtomicUsize::new(0), AtomicUsize::new(0)], + event_fd: [ + EventFd::new(libc::EFD_NONBLOCK).unwrap(), + EventFd::new(libc::EFD_NONBLOCK).unwrap(), + ], + } + } + + fn interrupt_cnt(&self, index: InterruptIndex) -> usize { + self.trigger_cnt[index as usize].load(Ordering::SeqCst) + } + + fn update_cnt(&self, index: InterruptIndex) -> usize { + self.update_cnt[index as usize].load(Ordering::SeqCst) + } } - pub fn enabled(&self) -> bool { - (self.msg_ctl >> MSIX_ENABLE_BIT) & 0x1 == 0x1 + impl InterruptSourceGroup for MockInterrupt { + fn trigger(&self, index: InterruptIndex) -> vm_device::interrupt::Result<()> { + self.trigger_cnt[index as usize].fetch_add(1, Ordering::SeqCst); + Ok(()) + } + + fn notifier(&self, index: InterruptIndex) -> Option<&EventFd> { + self.event_fd.get(index as usize) + } + + fn update( + &self, + index: InterruptIndex, + _config: InterruptSourceConfig, + _masked: bool, + _set_gsi: bool, + ) -> vm_device::interrupt::Result<()> { + self.update_cnt[index as usize].fetch_add(1, Ordering::SeqCst); + Ok(()) + } + + fn set_gsi(&self) -> vm_device::interrupt::Result<()> { + Ok(()) + } } - pub fn table_offset(&self) -> u32 { - self.table & 0xffff_fff8 + #[test] + #[should_panic] + fn test_too_many_vectors() { + MsixConfig::new(2049, Arc::new(MockInterrupt::new()), 0x42, None).unwrap(); } - pub fn pba_offset(&self) -> u32 { - self.pba & 0xffff_fff8 + #[test] + fn test_new_msix_config() { + let vectors = Arc::new(MockInterrupt::new()); + let config = MsixConfig::new(2, vectors.clone(), 0x42, None).unwrap(); + assert_eq!(config.devid, 0x42); + assert!(config.masked); + assert!(!config.enabled); + assert_eq!(config.table_entries.len(), 2); + assert_eq!(config.pba_entries.len(), 1); } - pub fn table_set_offset(&mut self, addr: u32) { - self.table &= 0x7; - self.table += addr; + #[test] + fn test_enable_msix_vectors() { + let vectors = Arc::new(MockInterrupt::new()); + let mut config = MsixConfig::new(2, vectors.clone(), 0x42, None).unwrap(); + + assert!(!config.enabled); + assert!(config.masked); + + // Bit 15 marks whether MSI-X is enabled + // Bit 14 marks whether vectors are masked + config.set_msg_ctl(0x8000); + assert!(config.enabled); + assert!(!config.masked); + + config.set_msg_ctl(0x4000); + assert!(!config.enabled); + assert!(config.masked); + + config.set_msg_ctl(0xC000); + assert!(config.enabled); + assert!(config.masked); + + config.set_msg_ctl(0x0); + assert!(!config.enabled); + assert!(!config.masked); } - pub fn pba_set_offset(&mut self, addr: u32) { - self.pba &= 0x7; - self.pba += addr; + #[test] + #[should_panic] + fn test_table_access_read_too_big() { + let vectors = Arc::new(MockInterrupt::new()); + let config = MsixConfig::new(2, vectors.clone(), 0x42, None).unwrap(); + let mut buffer = [0u8; 16]; + + config.read_table(0, &mut buffer); } - pub fn table_bir(&self) -> u32 { - self.table & 0x7 + #[test] + fn test_read_table_past_end() { + let vectors = Arc::new(MockInterrupt::new()); + let config = MsixConfig::new(2, vectors.clone(), 0x42, None).unwrap(); + let mut buffer = [0u8; 8]; + + // We have 2 vectors (16 bytes each), so we should be 
able to read up to 32 bytes. + // Past that the device should respond with all 1s + config.read_table(32, &mut buffer); + assert_eq!(buffer, [0xff; 8]); } - pub fn pba_bir(&self) -> u32 { - self.pba & 0x7 + #[test] + fn test_read_table_bad_length() { + let vectors = Arc::new(MockInterrupt::new()); + let config = MsixConfig::new(2, vectors.clone(), 0x42, None).unwrap(); + let mut buffer = [0u8; 8]; + + // We can either read 4 or 8 bytes + config.read_table(0, &mut buffer[..0]); + assert_eq!(buffer, [0x0; 8]); + config.read_table(0, &mut buffer[..1]); + assert_eq!(buffer[..1], [0xff; 1]); + config.read_table(0, &mut buffer[..2]); + assert_eq!(buffer[..2], [0xff; 2]); + config.read_table(0, &mut buffer[..3]); + assert_eq!(buffer[..3], [0xff; 3]); + config.read_table(0, &mut buffer[..5]); + assert_eq!(buffer[..5], [0xff; 5]); + config.read_table(0, &mut buffer[..6]); + assert_eq!(buffer[..6], [0xff; 6]); + config.read_table(0, &mut buffer[..7]); + assert_eq!(buffer[..7], [0xff; 7]); + config.read_table(0, &mut buffer[..4]); + assert_eq!(buffer, u64::to_le_bytes(0x00ff_ffff_0000_0000)); + config.read_table(0, &mut buffer); + assert_eq!(buffer, u64::to_le_bytes(0)); } - pub fn table_size(&self) -> u16 { - (self.msg_ctl & 0x7ff) + 1 + #[test] + fn test_access_table() { + let vectors = Arc::new(MockInterrupt::new()); + let mut config = MsixConfig::new(2, vectors.clone(), 0x42, None).unwrap(); + // enabled and not masked + config.set_msg_ctl(0x8000); + assert_eq!(vectors.update_cnt(0), 1); + assert_eq!(vectors.update_cnt(1), 1); + let mut buffer = [0u8; 8]; + + // Write first vector's address with a single 8-byte write + config.write_table(0, &u64::to_le_bytes(0x0000_1312_0000_1110)); + // It's still masked so shouldn't be updated + assert_eq!(vectors.update_cnt(0), 1); + assert_eq!(vectors.update_cnt(1), 1); + // Same for control and message data + config.write_table(8, &u64::to_le_bytes(0x0_0000_0020)); + // Now, we enabled it, so we should see an update + assert_eq!(vectors.update_cnt(0), 2); + assert_eq!(vectors.update_cnt(1), 1); + + // Write second vector's fields with 4-byte writes + // low 32 bits of the address + config.write_table(16, &u32::to_le_bytes(0x4241)); + assert_eq!(vectors.update_cnt(0), 2); + // Still masked + assert_eq!(vectors.update_cnt(1), 1); + // high 32 bits of the address + config.write_table(20, &u32::to_le_bytes(0x4443)); + assert_eq!(vectors.update_cnt(0), 2); + // Still masked + assert_eq!(vectors.update_cnt(1), 1); + // message data + config.write_table(24, &u32::to_le_bytes(0x21)); + assert_eq!(vectors.update_cnt(0), 2); + // Still masked + assert_eq!(vectors.update_cnt(1), 1); + // vector control + config.write_table(28, &u32::to_le_bytes(0x0)); + assert_eq!(vectors.update_cnt(0), 2); + assert_eq!(vectors.update_cnt(1), 2); + + assert_eq!(config.table_entries[0].msg_addr_hi, 0x1312); + assert_eq!(config.table_entries[0].msg_addr_lo, 0x1110); + assert_eq!(config.table_entries[0].msg_data, 0x20); + assert_eq!(config.table_entries[0].vector_ctl, 0); + + assert_eq!(config.table_entries[1].msg_addr_hi, 0x4443); + assert_eq!(config.table_entries[1].msg_addr_lo, 0x4241); + assert_eq!(config.table_entries[1].msg_data, 0x21); + assert_eq!(config.table_entries[1].vector_ctl, 0); + + assert_eq!(config.table_entries.len(), 2); + assert_eq!(config.pba_entries.len(), 1); + + // reading at a bad offset should return all 1s + config.read_table(1, &mut buffer[..4]); + assert_eq!(buffer[..4], [0xff; 4]); + // read low address for first vector + config.read_table(0, &mut 
buffer[..4]); + assert_eq!( + buffer[..4], + u32::to_le_bytes(config.table_entries[0].msg_addr_lo) + ); + // read the high address for first vector + config.read_table(4, &mut buffer[4..]); + assert_eq!(0x0000_1312_0000_1110, u64::from_le_bytes(buffer)); + // read msg_data from second vector + config.read_table(24, &mut buffer[..4]); + assert_eq!(u32::to_le_bytes(0x21), &buffer[..4]); + // read vector control for second vector + config.read_table(28, &mut buffer[..4]); + assert_eq!(u32::to_le_bytes(0x0), &buffer[..4]); + + // reading with 8 bytes at bad offset should also return all 1s + config.read_table(19, &mut buffer); + assert_eq!(buffer, [0xff; 8]); + + // Read the second vector's address using an 8 byte read + config.read_table(16, &mut buffer); + assert_eq!(0x0000_4443_0000_4241, u64::from_le_bytes(buffer)); + + // Read the first vector's ctrl and data with a single 8 byte read + config.read_table(8, &mut buffer); + assert_eq!(0x0_0000_0020, u64::from_le_bytes(buffer)); + + // If we mask the interrupts we shouldn't see any update + config.write_table(12, &u32::to_le_bytes(0x1)); + config.write_table(28, &u32::to_le_bytes(0x1)); + assert_eq!(vectors.update_cnt(0), 2); + assert_eq!(vectors.update_cnt(1), 2); + + // Un-masking them should update them + config.write_table(12, &u32::to_le_bytes(0x0)); + config.write_table(28, &u32::to_le_bytes(0x0)); + assert_eq!(vectors.update_cnt(0), 3); + assert_eq!(vectors.update_cnt(1), 3); + + // Setting up the same config should have no effect + config.write_table(12, &u32::to_le_bytes(0x0)); + config.write_table(28, &u32::to_le_bytes(0x0)); + assert_eq!(vectors.update_cnt(0), 3); + assert_eq!(vectors.update_cnt(1), 3); + } + + #[test] + #[should_panic] + fn test_table_access_write_too_big() { + let vectors = Arc::new(MockInterrupt::new()); + let mut config = MsixConfig::new(2, vectors.clone(), 0x42, None).unwrap(); + let buffer = [0u8; 16]; + + config.write_table(0, &buffer); + } + + #[test] + fn test_pba_read_too_big() { + let vectors = Arc::new(MockInterrupt::new()); + let config = MsixConfig::new(2, vectors.clone(), 0x42, None).unwrap(); + let mut buffer = [0u8; 16]; + + config.read_pba(0, &mut buffer); + assert_eq!(buffer, [0xff; 16]); + } + + #[test] + fn test_pba_invalid_offset() { + let vectors = Arc::new(MockInterrupt::new()); + let config = MsixConfig::new(2, vectors.clone(), 0x42, None).unwrap(); + let mut buffer = [0u8; 8]; + + // Past the end of the PBA array + config.read_pba(128, &mut buffer); + assert_eq!(buffer, [0xffu8; 8]); + + // Invalid offset within a valid entry + let mut buffer = [0u8; 8]; + config.read_pba(3, &mut buffer[..4]); + assert_eq!(buffer[..4], [0xffu8; 4]); + config.read_pba(3, &mut buffer); + assert_eq!(buffer, [0xffu8; 8]); + } + + #[test] + #[should_panic] + fn test_set_pba_bit_vector_too_big() { + let vectors = Arc::new(MockInterrupt::new()); + let mut config = MsixConfig::new(2, vectors.clone(), 0x42, None).unwrap(); + + config.set_pba_bit(2048, false); + } + + #[test] + #[should_panic] + fn test_get_pba_bit_vector_too_big() { + let vectors = Arc::new(MockInterrupt::new()); + let config = MsixConfig::new(2, vectors.clone(), 0x42, None).unwrap(); + + config.get_pba_bit(2048); + } + + #[test] + fn test_pba_bit_invalid_vector() { + let vectors = Arc::new(MockInterrupt::new()); + let mut config = MsixConfig::new(2, vectors.clone(), 0x42, None).unwrap(); + + // We have two vectors, so setting the pending bit for the third one + // should be ignored + config.set_pba_bit(2, false); + 
assert_eq!(config.pba_entries[0], 0); + + // Same for getting the bit + assert_eq!(config.get_pba_bit(2), 0xff); } - pub fn table_range(&self) -> (u64, u64) { - // The table takes 16 bytes per entry. - let size = self.table_size() as u64 * 16; - (self.table_offset() as u64, size) + #[test] + fn test_pba_read() { + let vectors = Arc::new(MockInterrupt::new()); + let mut config = MsixConfig::new(128, vectors.clone(), 0x42, None).unwrap(); + let mut buffer = [0u8; 8]; + + config.set_pba_bit(1, false); + assert_eq!(config.pba_entries[0], 2); + assert_eq!(config.pba_entries[1], 0); + config.read_pba(0, &mut buffer); + assert_eq!(0x2, u64::from_le_bytes(buffer)); + + let mut buffer = [0u8; 4]; + config.set_pba_bit(96, false); + assert_eq!(config.pba_entries[0], 2); + assert_eq!(config.pba_entries[1], 0x1_0000_0000); + config.read_pba(8, &mut buffer); + assert_eq!(0x0, u32::from_le_bytes(buffer)); + config.read_pba(12, &mut buffer); + assert_eq!(0x1, u32::from_le_bytes(buffer)); } - pub fn pba_range(&self) -> (u64, u64) { - // The table takes 1 bit per entry modulo 8 bytes. - let size = ((self.table_size() as u64 / 64) + 1) * 8; - (self.pba_offset() as u64, size) + #[test] + fn test_pending_interrupt() { + let vectors = Arc::new(MockInterrupt::new()); + let mut config = MsixConfig::new(2, vectors.clone(), 0x42, None).unwrap(); + config.set_pba_bit(1, false); + assert_eq!(config.get_pba_bit(1), 1); + // Enable MSI-X vector and unmask interrupts + config.set_msg_ctl(0x8000); + + // Individual vectors are still masked, so no change + assert_eq!(vectors.interrupt_cnt(0), 0); + assert_eq!(vectors.interrupt_cnt(1), 0); + + // Enable all vectors + config.write_table(8, &u64::to_le_bytes(0x0_0000_0020)); + config.write_table(24, &u64::to_le_bytes(0x0_0000_0020)); + + // Vector one had a pending bit, so we must have triggered an interrupt for it + // and cleared the pending bit + assert_eq!(vectors.interrupt_cnt(0), 0); + assert_eq!(vectors.interrupt_cnt(1), 1); + assert_eq!(config.get_pba_bit(1), 0); + + // Check that interrupt is sent as well for enabled vectors once we unmask from + // Message Control + + // Mask vectors and set pending bit for vector 0 + config.set_msg_ctl(0xc000); + config.set_pba_bit(0, false); + assert_eq!(vectors.interrupt_cnt(0), 0); + assert_eq!(vectors.interrupt_cnt(1), 1); + + // Unmask them + config.set_msg_ctl(0x8000); + assert_eq!(vectors.interrupt_cnt(0), 1); + assert_eq!(vectors.interrupt_cnt(1), 1); + assert_eq!(config.get_pba_bit(0), 0); } } diff --git a/src/vmm/src/devices/virtio/transport/pci/device.rs b/src/vmm/src/devices/virtio/transport/pci/device.rs index 12d6ff10345..038264bb417 100644 --- a/src/vmm/src/devices/virtio/transport/pci/device.rs +++ b/src/vmm/src/devices/virtio/transport/pci/device.rs @@ -783,7 +783,7 @@ impl VirtioInterrupt for VirtioInterruptMsix { // device should not inject the interrupt. // Instead, the Pending Bit Array table is updated to reflect there // is a pending interrupt for this specific vector. - if config.masked() || entry.masked() { + if config.masked || entry.masked() { config.set_pba_bit(vector, false); return Ok(()); }