From b76f92eddca7eedd700d51e60565648139f2041a Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 16 Apr 2025 10:16:02 +0200 Subject: [PATCH 01/56] chore: prepare virtio for multiple transport options This is just code organization changes. Create a new module under `virtio`, called `transport`. For the time being the only transport supported is `mmio`. Also, move `IrqInterrupt` type within the MMIO transport code, as it is MMIO specific. Signed-off-by: Babis Chalios --- src/vmm/src/builder.rs | 2 +- src/vmm/src/device_manager/mmio.rs | 7 +- src/vmm/src/device_manager/persist.rs | 2 +- src/vmm/src/devices/bus.rs | 2 +- src/vmm/src/devices/virtio/balloon/device.rs | 2 +- .../src/devices/virtio/balloon/test_utils.rs | 2 +- src/vmm/src/devices/virtio/block/device.rs | 3 +- .../devices/virtio/block/vhost_user/device.rs | 7 +- .../src/devices/virtio/block/virtio/device.rs | 5 +- .../devices/virtio/block/virtio/persist.rs | 3 +- .../devices/virtio/block/virtio/test_utils.rs | 4 +- src/vmm/src/devices/virtio/device.rs | 87 +------------------ src/vmm/src/devices/virtio/mod.rs | 2 +- src/vmm/src/devices/virtio/net/device.rs | 4 +- src/vmm/src/devices/virtio/net/test_utils.rs | 7 +- src/vmm/src/devices/virtio/persist.rs | 4 +- src/vmm/src/devices/virtio/rng/device.rs | 3 +- .../devices/virtio/{ => transport}/mmio.rs | 86 +++++++++++++++++- src/vmm/src/devices/virtio/transport/mod.rs | 5 ++ src/vmm/src/devices/virtio/vhost_user.rs | 2 +- src/vmm/src/devices/virtio/vsock/device.rs | 5 +- 21 files changed, 130 insertions(+), 114 deletions(-) rename src/vmm/src/devices/virtio/{ => transport}/mmio.rs (92%) create mode 100644 src/vmm/src/devices/virtio/transport/mod.rs diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 4a810ee083a..84138afd79d 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -44,9 +44,9 @@ use crate::devices::legacy::{EventFdTrigger, SerialEventsWrapper, SerialWrapper} use crate::devices::virtio::balloon::Balloon; use 
crate::devices::virtio::block::device::Block; use crate::devices::virtio::device::VirtioDevice; -use crate::devices::virtio::mmio::MmioTransport; use crate::devices::virtio::net::Net; use crate::devices::virtio::rng::Entropy; +use crate::devices::virtio::transport::mmio::MmioTransport; use crate::devices::virtio::vsock::{Vsock, VsockUnixBackend}; #[cfg(feature = "gdb")] use crate::gdb; diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 394935fe5c1..b4f1d39412c 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -30,9 +30,9 @@ use crate::devices::pseudo::BootTimer; use crate::devices::virtio::balloon::Balloon; use crate::devices::virtio::block::device::Block; use crate::devices::virtio::device::VirtioDevice; -use crate::devices::virtio::mmio::MmioTransport; use crate::devices::virtio::net::Net; use crate::devices::virtio::rng::Entropy; +use crate::devices::virtio::transport::mmio::MmioTransport; use crate::devices::virtio::vsock::{TYPE_VSOCK, Vsock, VsockUnixBackend}; use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; #[cfg(target_arch = "x86_64")] @@ -223,7 +223,7 @@ impl MMIODeviceManager { device_info: &MMIODeviceInfo, ) -> Result<(), MmioError> { // as per doc, [virtio_mmio.]device=@: needs to be appended - // to kernel command line for virtio mmio devices to get recongnized + // to kernel command line for virtio mmio devices to get recognized // the size parameter has to be transformed to KiB, so dividing hexadecimal value in // bytes to 1024; further, the '{}' formatting rust construct will automatically // transform it to decimal @@ -530,8 +530,9 @@ mod tests { use super::*; use crate::Vm; use crate::devices::virtio::ActivateError; - use crate::devices::virtio::device::{IrqTrigger, VirtioDevice}; + use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::Queue; + use crate::devices::virtio::transport::mmio::IrqTrigger; use 
crate::test_utils::multi_region_mem_raw; use crate::vstate::kvm::Kvm; use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 30a6387bc82..43ded58c4b7 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -25,7 +25,6 @@ use crate::devices::virtio::block::BlockError; use crate::devices::virtio::block::device::Block; use crate::devices::virtio::block::persist::{BlockConstructorArgs, BlockState}; use crate::devices::virtio::device::VirtioDevice; -use crate::devices::virtio::mmio::MmioTransport; use crate::devices::virtio::net::Net; use crate::devices::virtio::net::persist::{ NetConstructorArgs, NetPersistError as NetError, NetState, @@ -35,6 +34,7 @@ use crate::devices::virtio::rng::Entropy; use crate::devices::virtio::rng::persist::{ EntropyConstructorArgs, EntropyPersistError as EntropyError, EntropyState, }; +use crate::devices::virtio::transport::mmio::MmioTransport; use crate::devices::virtio::vsock::persist::{ VsockConstructorArgs, VsockState, VsockUdsConstructorArgs, }; diff --git a/src/vmm/src/devices/bus.rs b/src/vmm/src/devices/bus.rs index 2b016d73083..d0e1b296998 100644 --- a/src/vmm/src/devices/bus.rs +++ b/src/vmm/src/devices/bus.rs @@ -56,7 +56,7 @@ use event_manager::{EventOps, Events, MutEventSubscriber}; use super::legacy::RTCDevice; use super::legacy::{I8042Device, SerialDevice}; use super::pseudo::BootTimer; -use super::virtio::mmio::MmioTransport; +use super::virtio::transport::mmio::MmioTransport; #[derive(Debug)] pub enum BusDevice { diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs index f9acbcf2c9b..c8601866b2b 100644 --- a/src/vmm/src/devices/virtio/balloon/device.rs +++ b/src/vmm/src/devices/virtio/balloon/device.rs @@ -24,9 +24,9 @@ use super::{ VIRTIO_BALLOON_S_SWAP_OUT, }; use crate::devices::virtio::balloon::BalloonError; -use 
crate::devices::virtio::device::{IrqTrigger, IrqType}; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::queue::InvalidAvailIdx; +use crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; use crate::logger::IncMetric; use crate::utils::u64_to_usize; use crate::vstate::memory::{Address, ByteValued, Bytes, GuestAddress, GuestMemoryMmap}; diff --git a/src/vmm/src/devices/virtio/balloon/test_utils.rs b/src/vmm/src/devices/virtio/balloon/test_utils.rs index af0d7f5845e..69b0b4f92a0 100644 --- a/src/vmm/src/devices/virtio/balloon/test_utils.rs +++ b/src/vmm/src/devices/virtio/balloon/test_utils.rs @@ -10,7 +10,7 @@ use crate::devices::virtio::{balloon::BALLOON_NUM_QUEUES, balloon::Balloon}; #[cfg(test)] pub fn invoke_handler_for_queue_event(b: &mut Balloon, queue_index: usize) { use crate::devices::virtio::balloon::{DEFLATE_INDEX, INFLATE_INDEX, STATS_INDEX}; - use crate::devices::virtio::device::IrqType; + use crate::devices::virtio::transport::mmio::IrqType; assert!(queue_index < BALLOON_NUM_QUEUES); // Trigger the queue event. 
diff --git a/src/vmm/src/devices/virtio/block/device.rs b/src/vmm/src/devices/virtio/block/device.rs index 5d41eb04078..4f4676a24a8 100644 --- a/src/vmm/src/devices/virtio/block/device.rs +++ b/src/vmm/src/devices/virtio/block/device.rs @@ -8,8 +8,9 @@ use super::BlockError; use super::persist::{BlockConstructorArgs, BlockState}; use super::vhost_user::device::{VhostUserBlock, VhostUserBlockConfig}; use super::virtio::device::{VirtioBlock, VirtioBlockConfig}; -use crate::devices::virtio::device::{IrqTrigger, VirtioDevice}; +use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::{InvalidAvailIdx, Queue}; +use crate::devices::virtio::transport::mmio::IrqTrigger; use crate::devices::virtio::{ActivateError, TYPE_BLOCK}; use crate::rate_limiter::BucketUpdate; use crate::snapshot::Persist; diff --git a/src/vmm/src/devices/virtio/block/vhost_user/device.rs b/src/vmm/src/devices/virtio/block/vhost_user/device.rs index a42a2fe0c46..87f6264db4c 100644 --- a/src/vmm/src/devices/virtio/block/vhost_user/device.rs +++ b/src/vmm/src/devices/virtio/block/vhost_user/device.rs @@ -14,11 +14,12 @@ use vmm_sys_util::eventfd::EventFd; use super::{NUM_QUEUES, QUEUE_SIZE, VhostUserBlockError}; use crate::devices::virtio::block::CacheType; -use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; +use crate::devices::virtio::device::{DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_blk::{VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO}; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::generated::virtio_ring::VIRTIO_RING_F_EVENT_IDX; use crate::devices::virtio::queue::Queue; +use crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; use crate::devices::virtio::vhost_user::{VhostUserHandleBackend, VhostUserHandleImpl}; use crate::devices::virtio::vhost_user_metrics::{ VhostUserDeviceMetrics, VhostUserMetricsPerDevice, @@ -34,7 +35,7 @@ const 
BLOCK_CONFIG_SPACE_SIZE: u32 = 60; const AVAILABLE_FEATURES: u64 = (1 << VIRTIO_F_VERSION_1) | (1 << VIRTIO_RING_F_EVENT_IDX) - // vhost-user specific bit. Not defined in standart virtio spec. + // vhost-user specific bit. Not defined in standard virtio spec. // Specifies ability of frontend to negotiate protocol features. | VhostUserVirtioFeatures::PROTOCOL_FEATURES.bits() // We always try to negotiate readonly with the backend. @@ -375,8 +376,8 @@ mod tests { use super::*; use crate::devices::virtio::block::virtio::device::FileEngineType; - use crate::devices::virtio::mmio::VIRTIO_MMIO_INT_CONFIG; use crate::devices::virtio::test_utils::VirtQueue; + use crate::devices::virtio::transport::mmio::VIRTIO_MMIO_INT_CONFIG; use crate::devices::virtio::vhost_user::tests::create_mem; use crate::test_utils::create_tmp_socket; use crate::vstate::memory::GuestAddress; diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index 2f5d88114b6..f1e978cc096 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -23,13 +23,14 @@ use super::request::*; use super::{BLOCK_QUEUE_SIZES, SECTOR_SHIFT, SECTOR_SIZE, VirtioBlockError, io as block_io}; use crate::devices::virtio::block::CacheType; use crate::devices::virtio::block::virtio::metrics::{BlockDeviceMetrics, BlockMetricsPerDevice}; -use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; +use crate::devices::virtio::device::{DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_blk::{ VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO, VIRTIO_BLK_ID_BYTES, }; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::generated::virtio_ring::VIRTIO_RING_F_EVENT_IDX; use crate::devices::virtio::queue::{InvalidAvailIdx, Queue}; +use crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; use 
crate::devices::virtio::{ActivateError, TYPE_BLOCK}; use crate::logger::{IncMetric, error, warn}; use crate::rate_limiter::{BucketUpdate, RateLimiter}; @@ -829,7 +830,7 @@ mod tests { block.read_config(0, actual_config_space.as_mut_slice()); assert_eq!(actual_config_space, expected_config_space); - // If priviledged user writes to `/dev/mem`, in block config space - byte by byte. + // If privileged user writes to `/dev/mem`, in block config space - byte by byte. let expected_config_space = ConfigSpace { capacity: 0x1122334455667788, }; diff --git a/src/vmm/src/devices/virtio/block/virtio/persist.rs b/src/vmm/src/devices/virtio/block/virtio/persist.rs index 8c6f2c2453d..dafad8e91e6 100644 --- a/src/vmm/src/devices/virtio/block/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/block/virtio/persist.rs @@ -16,9 +16,10 @@ use crate::devices::virtio::TYPE_BLOCK; use crate::devices::virtio::block::persist::BlockConstructorArgs; use crate::devices::virtio::block::virtio::device::FileEngineType; use crate::devices::virtio::block::virtio::metrics::BlockMetricsPerDevice; -use crate::devices::virtio::device::{DeviceState, IrqTrigger}; +use crate::devices::virtio::device::DeviceState; use crate::devices::virtio::generated::virtio_blk::VIRTIO_BLK_F_RO; use crate::devices::virtio::persist::VirtioDeviceState; +use crate::devices::virtio::transport::mmio::IrqTrigger; use crate::rate_limiter::RateLimiter; use crate::rate_limiter::persist::RateLimiterState; use crate::snapshot::Persist; diff --git a/src/vmm/src/devices/virtio/block/virtio/test_utils.rs b/src/vmm/src/devices/virtio/block/virtio/test_utils.rs index 02dd34fbce9..b05e899f32d 100644 --- a/src/vmm/src/devices/virtio/block/virtio/test_utils.rs +++ b/src/vmm/src/devices/virtio/block/virtio/test_utils.rs @@ -16,10 +16,10 @@ use crate::devices::virtio::block::virtio::device::FileEngineType; #[cfg(test)] use crate::devices::virtio::block::virtio::io::FileEngine; use crate::devices::virtio::block::virtio::{CacheType, 
VirtioBlock}; -#[cfg(test)] -use crate::devices::virtio::device::IrqType; use crate::devices::virtio::queue::{Queue, VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; use crate::devices::virtio::test_utils::{VirtQueue, VirtqDesc}; +#[cfg(test)] +use crate::devices::virtio::transport::mmio::IrqType; use crate::rate_limiter::RateLimiter; use crate::vmm_config::{RateLimiterConfig, TokenBucketConfig}; use crate::vstate::memory::{Bytes, GuestAddress}; diff --git a/src/vmm/src/devices/virtio/device.rs b/src/vmm/src/devices/virtio/device.rs index ba1ca6b279e..2afe8f2f485 100644 --- a/src/vmm/src/devices/virtio/device.rs +++ b/src/vmm/src/devices/virtio/device.rs @@ -7,15 +7,15 @@ use std::fmt; use std::sync::Arc; -use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::atomic::AtomicU32; use vmm_sys_util::eventfd::EventFd; use super::ActivateError; -use super::mmio::{VIRTIO_MMIO_INT_CONFIG, VIRTIO_MMIO_INT_VRING}; use super::queue::{Queue, QueueError}; +use super::transport::mmio::IrqTrigger; use crate::devices::virtio::AsAny; -use crate::logger::{error, warn}; +use crate::logger::warn; use crate::vstate::memory::GuestMemoryMmap; /// Enum that indicates if a VirtioDevice is inactive or has been activated @@ -44,46 +44,6 @@ impl DeviceState { } } -/// The 2 types of interrupt sources in MMIO transport. -#[derive(Debug)] -pub enum IrqType { - /// Interrupt triggered by change in config. - Config, - /// Interrupt triggered by used vring buffers. 
- Vring, -} - -/// Helper struct that is responsible for triggering guest IRQs -#[derive(Debug)] -pub struct IrqTrigger { - pub(crate) irq_status: Arc<AtomicU32>, - pub(crate) irq_evt: EventFd, -} - -impl IrqTrigger { - pub fn new() -> std::io::Result<Self> { - Ok(Self { - irq_status: Arc::new(AtomicU32::new(0)), - irq_evt: EventFd::new(libc::EFD_NONBLOCK)?, - }) - } - - pub fn trigger_irq(&self, irq_type: IrqType) -> Result<(), std::io::Error> { - let irq = match irq_type { - IrqType::Config => VIRTIO_MMIO_INT_CONFIG, - IrqType::Vring => VIRTIO_MMIO_INT_VRING, - }; - self.irq_status.fetch_or(irq, Ordering::SeqCst); - - self.irq_evt.write(1).map_err(|err| { - error!("Failed to send irq to the guest: {:?}", err); - err - })?; - - Ok(()) - } -} - /// Trait for virtio devices to be driven by a virtio transport. /// /// The lifecycle of a virtio device is to be moved to a virtio transport, which will then query the @@ -200,47 +160,6 @@ impl fmt::Debug for dyn VirtioDevice { pub(crate) mod tests { use super::*; - impl IrqTrigger { - pub fn has_pending_irq(&self, irq_type: IrqType) -> bool { - if let Ok(num_irqs) = self.irq_evt.read() { - if num_irqs == 0 { - return false; - } - - let irq_status = self.irq_status.load(Ordering::SeqCst); - return matches!( - (irq_status, irq_type), - (VIRTIO_MMIO_INT_CONFIG, IrqType::Config) - | (VIRTIO_MMIO_INT_VRING, IrqType::Vring) - ); - } - - false - } - } - - #[test] - fn irq_trigger() { - let irq_trigger = IrqTrigger::new().unwrap(); - assert_eq!(irq_trigger.irq_status.load(Ordering::SeqCst), 0); - - // Check that there are no pending irqs. - assert!(!irq_trigger.has_pending_irq(IrqType::Config)); - assert!(!irq_trigger.has_pending_irq(IrqType::Vring)); - - // Check that trigger_irq() correctly generates irqs. 
- irq_trigger.trigger_irq(IrqType::Config).unwrap(); - assert!(irq_trigger.has_pending_irq(IrqType::Config)); - irq_trigger.irq_status.store(0, Ordering::SeqCst); - irq_trigger.trigger_irq(IrqType::Vring).unwrap(); - assert!(irq_trigger.has_pending_irq(IrqType::Vring)); - - // Check trigger_irq() failure case (irq_evt is full). - irq_trigger.irq_evt.write(u64::MAX - 1).unwrap(); - irq_trigger.trigger_irq(IrqType::Config).unwrap_err(); - irq_trigger.trigger_irq(IrqType::Vring).unwrap_err(); - } - #[derive(Debug)] struct MockVirtioDevice { acked_features: u64, diff --git a/src/vmm/src/devices/virtio/mod.rs b/src/vmm/src/devices/virtio/mod.rs index f298d28e9bd..0ac3b660397 100644 --- a/src/vmm/src/devices/virtio/mod.rs +++ b/src/vmm/src/devices/virtio/mod.rs @@ -18,12 +18,12 @@ pub mod device; pub mod generated; mod iov_deque; pub mod iovec; -pub mod mmio; pub mod net; pub mod persist; pub mod queue; pub mod rng; pub mod test_utils; +pub mod transport; pub mod vhost_user; pub mod vhost_user_metrics; pub mod vsock; diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index 2ce60707271..093c83c354b 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -15,7 +15,7 @@ use log::error; use vmm_sys_util::eventfd::EventFd; use super::NET_QUEUE_MAX_SIZE; -use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; +use crate::devices::virtio::device::{DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::generated::virtio_net::{ VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, @@ -32,6 +32,7 @@ use crate::devices::virtio::net::{ MAX_BUFFER_SIZE, NET_QUEUE_SIZES, NetError, NetQueue, RX_INDEX, TX_INDEX, generated, }; use crate::devices::virtio::queue::{DescriptorChain, InvalidAvailIdx, Queue}; +use 
crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; use crate::devices::virtio::{ActivateError, TYPE_NET}; use crate::devices::{DeviceError, report_net_event_fail}; use crate::dumbo::pdu::arp::ETH_IPV4_FRAME_LEN; @@ -1059,6 +1060,7 @@ pub mod tests { }; use crate::devices::virtio::queue::VIRTQ_DESC_F_WRITE; use crate::devices::virtio::test_utils::VirtQueue; + use crate::devices::virtio::transport::mmio::IrqType; use crate::dumbo::EthernetFrame; use crate::dumbo::pdu::arp::{ETH_IPV4_FRAME_LEN, EthIPv4ArpFrame}; use crate::dumbo::pdu::ethernet::ETHERTYPE_ARP; diff --git a/src/vmm/src/devices/virtio/net/test_utils.rs b/src/vmm/src/devices/virtio/net/test_utils.rs index 2df7891e034..ec52883e979 100644 --- a/src/vmm/src/devices/virtio/net/test_utils.rs +++ b/src/vmm/src/devices/virtio/net/test_utils.rs @@ -103,7 +103,7 @@ impl TapTrafficSimulator { let send_addr_ptr = &mut storage as *mut libc::sockaddr_storage; - // SAFETY: `sock_addr` is a valid pointer and safe to derference. + // SAFETY: `sock_addr` is a valid pointer and safe to dereference. unsafe { let sock_addr: *mut libc::sockaddr_ll = send_addr_ptr.cast::<libc::sockaddr_ll>(); (*sock_addr).sll_family = libc::sa_family_t::try_from(libc::AF_PACKET).unwrap(); @@ -222,7 +222,7 @@ pub fn if_index(tap: &Tap) -> i32 { /// Enable the tap interface. 
pub fn enable(tap: &Tap) { - // Disable IPv6 router advertisment requests + // Disable IPv6 router advertisement requests Command::new("sh") .arg("-c") .arg(format!( @@ -291,7 +291,7 @@ pub mod test { use event_manager::{EventManager, SubscriberId, SubscriberOps}; use crate::check_metric_after_block; - use crate::devices::virtio::device::{IrqType, VirtioDevice}; + use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::net::device::vnet_hdr_len; use crate::devices::virtio::net::generated::ETH_HLEN; use crate::devices::virtio::net::test_utils::{ @@ -300,6 +300,7 @@ pub mod test { use crate::devices::virtio::net::{MAX_BUFFER_SIZE, Net, RX_INDEX, TX_INDEX}; use crate::devices::virtio::queue::{VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; use crate::devices::virtio::test_utils::{VirtQueue, VirtqDesc}; + use crate::devices::virtio::transport::mmio::IrqType; use crate::logger::IncMetric; use crate::vstate::memory::{Address, Bytes, GuestAddress, GuestMemoryMmap}; diff --git a/src/vmm/src/devices/virtio/persist.rs b/src/vmm/src/devices/virtio/persist.rs index 664f6d57efb..1a1eb6dba7d 100644 --- a/src/vmm/src/devices/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/persist.rs @@ -12,8 +12,8 @@ use serde::{Deserialize, Serialize}; use super::queue::{InvalidAvailIdx, QueueError}; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::generated::virtio_ring::VIRTIO_RING_F_EVENT_IDX; -use crate::devices::virtio::mmio::MmioTransport; use crate::devices::virtio::queue::Queue; +use crate::devices::virtio::transport::mmio::MmioTransport; use crate::snapshot::Persist; use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; @@ -256,10 +256,10 @@ mod tests { use crate::devices::virtio::block::virtio::VirtioBlock; use crate::devices::virtio::block::virtio::device::FileEngineType; use crate::devices::virtio::block::virtio::test_utils::default_block_with_path; - use crate::devices::virtio::mmio::tests::DummyDevice; use 
crate::devices::virtio::net::Net; use crate::devices::virtio::net::test_utils::default_net; use crate::devices::virtio::test_utils::default_mem; + use crate::devices::virtio::transport::mmio::tests::DummyDevice; use crate::devices::virtio::vsock::{Vsock, VsockUnixBackend}; use crate::snapshot::Snapshot; diff --git a/src/vmm/src/devices/virtio/rng/device.rs b/src/vmm/src/devices/virtio/rng/device.rs index 38308e9b6b7..2ee9834167d 100644 --- a/src/vmm/src/devices/virtio/rng/device.rs +++ b/src/vmm/src/devices/virtio/rng/device.rs @@ -12,11 +12,12 @@ use vmm_sys_util::eventfd::EventFd; use super::metrics::METRICS; use super::{RNG_NUM_QUEUES, RNG_QUEUE}; use crate::devices::DeviceError; -use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; +use crate::devices::virtio::device::{DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::iov_deque::IovDequeError; use crate::devices::virtio::iovec::IoVecBufferMut; use crate::devices::virtio::queue::{FIRECRACKER_MAX_QUEUE_SIZE, InvalidAvailIdx, Queue}; +use crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; use crate::devices::virtio::{ActivateError, TYPE_RNG}; use crate::logger::{IncMetric, debug, error}; use crate::rate_limiter::{RateLimiter, TokenType}; diff --git a/src/vmm/src/devices/virtio/mmio.rs b/src/vmm/src/devices/virtio/transport/mmio.rs similarity index 92% rename from src/vmm/src/devices/virtio/mmio.rs rename to src/vmm/src/devices/virtio/transport/mmio.rs index 4114838bdd3..b6e2b796398 100644 --- a/src/vmm/src/devices/virtio/mmio.rs +++ b/src/vmm/src/devices/virtio/transport/mmio.rs @@ -9,7 +9,9 @@ use std::fmt::Debug; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex, MutexGuard}; -use crate::devices::virtio::device::{IrqType, VirtioDevice}; +use vmm_sys_util::eventfd::EventFd; + +use crate::devices::virtio::device::VirtioDevice; use 
crate::devices::virtio::device_status; use crate::devices::virtio::queue::Queue; use crate::logger::{error, warn}; @@ -356,13 +358,52 @@ impl MmioTransport { } } +/// The 2 types of interrupt sources in MMIO transport. +#[derive(Debug)] +pub enum IrqType { + /// Interrupt triggered by change in config. + Config, + /// Interrupt triggered by used vring buffers. + Vring, +} + +/// Helper struct that is responsible for triggering guest IRQs +#[derive(Debug)] +pub struct IrqTrigger { + pub(crate) irq_status: Arc<AtomicU32>, + pub(crate) irq_evt: EventFd, +} + +impl IrqTrigger { + pub fn new() -> std::io::Result<Self> { + Ok(Self { + irq_status: Arc::new(AtomicU32::new(0)), + irq_evt: EventFd::new(libc::EFD_NONBLOCK)?, + }) + } + + pub fn trigger_irq(&self, irq_type: IrqType) -> Result<(), std::io::Error> { + let irq = match irq_type { + IrqType::Config => VIRTIO_MMIO_INT_CONFIG, + IrqType::Vring => VIRTIO_MMIO_INT_VRING, + }; + self.irq_status.fetch_or(irq, Ordering::SeqCst); + + self.irq_evt.write(1).map_err(|err| { + error!("Failed to send irq to the guest: {:?}", err); + err + })?; + + Ok(()) + } +} + #[cfg(test)] pub(crate) mod tests { use vmm_sys_util::eventfd::EventFd; use super::*; use crate::devices::virtio::ActivateError; - use crate::devices::virtio::device::IrqTrigger; use crate::devices::virtio::device_status::DEVICE_NEEDS_RESET; use crate::test_utils::single_region_mem; use crate::utils::byte_order::{read_le_u32, write_le_u32}; @@ -968,4 +1009,45 @@ pub(crate) mod tests { dummy_dev.ack_features_by_page(0, 8); assert_eq!(dummy_dev.acked_features(), 24); } + + impl IrqTrigger { + pub fn has_pending_irq(&self, irq_type: IrqType) -> bool { + if let Ok(num_irqs) = self.irq_evt.read() { + if num_irqs == 0 { + return false; + } + + let irq_status = self.irq_status.load(Ordering::SeqCst); + return matches!( + (irq_status, irq_type), + (VIRTIO_MMIO_INT_CONFIG, IrqType::Config) + | (VIRTIO_MMIO_INT_VRING, IrqType::Vring) + ); + } + + false + } + } + + #[test] + fn irq_trigger() { + 
let irq_trigger = IrqTrigger::new().unwrap(); + assert_eq!(irq_trigger.irq_status.load(Ordering::SeqCst), 0); + + // Check that there are no pending irqs. + assert!(!irq_trigger.has_pending_irq(IrqType::Config)); + assert!(!irq_trigger.has_pending_irq(IrqType::Vring)); + + // Check that trigger_irq() correctly generates irqs. + irq_trigger.trigger_irq(IrqType::Config).unwrap(); + assert!(irq_trigger.has_pending_irq(IrqType::Config)); + irq_trigger.irq_status.store(0, Ordering::SeqCst); + irq_trigger.trigger_irq(IrqType::Vring).unwrap(); + assert!(irq_trigger.has_pending_irq(IrqType::Vring)); + + // Check trigger_irq() failure case (irq_evt is full). + irq_trigger.irq_evt.write(u64::MAX - 1).unwrap(); + irq_trigger.trigger_irq(IrqType::Config).unwrap_err(); + irq_trigger.trigger_irq(IrqType::Vring).unwrap_err(); + } } diff --git a/src/vmm/src/devices/virtio/transport/mod.rs b/src/vmm/src/devices/virtio/transport/mod.rs new file mode 100644 index 00000000000..1ff8229a1c8 --- /dev/null +++ b/src/vmm/src/devices/virtio/transport/mod.rs @@ -0,0 +1,5 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +/// MMIO transport for VirtIO devices +pub mod mmio; diff --git a/src/vmm/src/devices/virtio/vhost_user.rs b/src/vmm/src/devices/virtio/vhost_user.rs index 13b0d71b35a..d90ad16b08c 100644 --- a/src/vmm/src/devices/virtio/vhost_user.rs +++ b/src/vmm/src/devices/virtio/vhost_user.rs @@ -13,8 +13,8 @@ use vhost::{Error as VhostError, VhostBackend, VhostUserMemoryRegionInfo, VringC use vm_memory::{Address, Error as MmapError, GuestMemory, GuestMemoryError, GuestMemoryRegion}; use vmm_sys_util::eventfd::EventFd; -use crate::devices::virtio::device::IrqTrigger; use crate::devices::virtio::queue::Queue; +use crate::devices::virtio::transport::mmio::IrqTrigger; use crate::vstate::memory::GuestMemoryMmap; /// vhost-user error. 
diff --git a/src/vmm/src/devices/virtio/vsock/device.rs b/src/vmm/src/devices/virtio/vsock/device.rs index a4377768322..e0b8477123a 100644 --- a/src/vmm/src/devices/virtio/vsock/device.rs +++ b/src/vmm/src/devices/virtio/vsock/device.rs @@ -6,7 +6,7 @@ // found in the THIRD-PARTY file. //! This is the `VirtioDevice` implementation for our vsock device. It handles the virtio-level -//! device logic: feature negociation, device configuration, and device activation. +//! device logic: feature negotiation, device configuration, and device activation. //! //! We aim to conform to the VirtIO v1.1 spec: //! https://docs.oasis-open.org/virtio/virtio/v1.1/virtio-v1.1.html @@ -30,9 +30,10 @@ use super::defs::uapi; use super::packet::{VSOCK_PKT_HDR_SIZE, VsockPacketRx, VsockPacketTx}; use super::{VsockBackend, defs}; use crate::devices::virtio::ActivateError; -use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; +use crate::devices::virtio::device::{DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_config::{VIRTIO_F_IN_ORDER, VIRTIO_F_VERSION_1}; use crate::devices::virtio::queue::{InvalidAvailIdx, Queue as VirtQueue}; +use crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; use crate::devices::virtio::vsock::VsockError; use crate::devices::virtio::vsock::metrics::METRICS; use crate::logger::IncMetric; From 6e588c611c8b9a198313018bd17842cb0515f80e Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 23 Apr 2025 13:56:00 +0200 Subject: [PATCH 02/56] chore: avoid IrqTrigger::new().unwrap() `IrqTrigger::new()` returns a `Result` because creating an `EventFd` might fail with an `std::io::Error` error. All users of `IrqTrigger` create the object and directly unwrap the error. To avoid unwraps all over the place, change `IrqTrigger::new()` to unwrap a potential error while creating the EventFd internally and just return `Self`. 
Signed-off-by: Babis Chalios --- src/vmm/src/device_manager/mmio.rs | 2 +- src/vmm/src/devices/virtio/balloon/device.rs | 2 +- .../devices/virtio/block/vhost_user/device.rs | 2 +- .../src/devices/virtio/block/virtio/device.rs | 2 +- .../devices/virtio/block/virtio/persist.rs | 2 +- src/vmm/src/devices/virtio/net/device.rs | 2 +- src/vmm/src/devices/virtio/rng/device.rs | 2 +- src/vmm/src/devices/virtio/transport/mmio.rs | 19 +++++++++++++------ src/vmm/src/devices/virtio/vhost_user.rs | 2 +- src/vmm/src/devices/virtio/vsock/device.rs | 2 +- 10 files changed, 22 insertions(+), 15 deletions(-) diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index b4f1d39412c..c0e11997f96 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -585,7 +585,7 @@ mod tests { dummy: 0, queues: QUEUE_SIZES.iter().map(|&s| Queue::new(s)).collect(), queue_evts: [EventFd::new(libc::EFD_NONBLOCK).expect("cannot create eventFD")], - interrupt_trigger: IrqTrigger::new().expect("cannot create eventFD"), + interrupt_trigger: IrqTrigger::new(), } } } diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs index c8601866b2b..f6ad2f054a5 100644 --- a/src/vmm/src/devices/virtio/balloon/device.rs +++ b/src/vmm/src/devices/virtio/balloon/device.rs @@ -243,7 +243,7 @@ impl Balloon { }, queue_evts, queues, - irq_trigger: IrqTrigger::new().map_err(BalloonError::EventFd)?, + irq_trigger: IrqTrigger::new(), device_state: DeviceState::Inactive, activate_evt: EventFd::new(libc::EFD_NONBLOCK).map_err(BalloonError::EventFd)?, restored_from_file, diff --git a/src/vmm/src/devices/virtio/block/vhost_user/device.rs b/src/vmm/src/devices/virtio/block/vhost_user/device.rs index 87f6264db4c..264db2fa7f0 100644 --- a/src/vmm/src/devices/virtio/block/vhost_user/device.rs +++ b/src/vmm/src/devices/virtio/block/vhost_user/device.rs @@ -204,7 +204,7 @@ impl VhostUserBlockImpl { let queue_evts = 
[EventFd::new(libc::EFD_NONBLOCK).map_err(VhostUserBlockError::EventFd)?; u64_to_usize(NUM_QUEUES)]; let device_state = DeviceState::Inactive; - let irq_trigger = IrqTrigger::new().map_err(VhostUserBlockError::IrqTrigger)?; + let irq_trigger = IrqTrigger::new(); // We negotiated features with backend. Now these acked_features // are available for guest driver to choose from. diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index f1e978cc096..e89443e5bd9 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -323,7 +323,7 @@ impl VirtioBlock { queues, queue_evts, device_state: DeviceState::Inactive, - irq_trigger: IrqTrigger::new().map_err(VirtioBlockError::IrqTrigger)?, + irq_trigger: IrqTrigger::new(), id: config.drive_id.clone(), partuuid: config.partuuid, diff --git a/src/vmm/src/devices/virtio/block/virtio/persist.rs b/src/vmm/src/devices/virtio/block/virtio/persist.rs index dafad8e91e6..33a33968e53 100644 --- a/src/vmm/src/devices/virtio/block/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/block/virtio/persist.rs @@ -112,7 +112,7 @@ impl Persist<'_> for VirtioBlock { ) .map_err(VirtioBlockError::Persist)?; - let mut irq_trigger = IrqTrigger::new().map_err(VirtioBlockError::IrqTrigger)?; + let mut irq_trigger = IrqTrigger::new(); irq_trigger.irq_status = Arc::new(AtomicU32::new(state.virtio_state.interrupt_status)); let avail_features = state.virtio_state.avail_features; diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index 093c83c354b..55a94636495 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -314,7 +314,7 @@ impl Net { tx_rate_limiter, rx_frame_buf: [0u8; MAX_BUFFER_SIZE], tx_frame_headers: [0u8; frame_hdr_len()], - irq_trigger: IrqTrigger::new().map_err(NetError::EventFd)?, + irq_trigger: IrqTrigger::new(), 
config_space, guest_mac, device_state: DeviceState::Inactive, diff --git a/src/vmm/src/devices/virtio/rng/device.rs b/src/vmm/src/devices/virtio/rng/device.rs index 2ee9834167d..d644161d87e 100644 --- a/src/vmm/src/devices/virtio/rng/device.rs +++ b/src/vmm/src/devices/virtio/rng/device.rs @@ -70,7 +70,7 @@ impl Entropy { let queue_events = (0..RNG_NUM_QUEUES) .map(|_| EventFd::new(libc::EFD_NONBLOCK)) .collect::<Result<Vec<EventFd>, io::Error>>()?; - let irq_trigger = IrqTrigger::new()?; + let irq_trigger = IrqTrigger::new(); Ok(Self { avail_features: 1 << VIRTIO_F_VERSION_1, diff --git a/src/vmm/src/devices/virtio/transport/mmio.rs b/src/vmm/src/devices/virtio/transport/mmio.rs index b6e2b796398..a763ff811dc 100644 --- a/src/vmm/src/devices/virtio/transport/mmio.rs +++ b/src/vmm/src/devices/virtio/transport/mmio.rs @@ -374,12 +374,19 @@ pub struct IrqTrigger { pub(crate) irq_evt: EventFd, } +impl Default for IrqTrigger { + fn default() -> Self { + Self::new() + } +} + impl IrqTrigger { - pub fn new() -> std::io::Result<Self> { - Ok(Self { + pub fn new() -> Self { + Self { irq_status: Arc::new(AtomicU32::new(0)), - irq_evt: EventFd::new(libc::EFD_NONBLOCK)?, - }) + irq_evt: EventFd::new(libc::EFD_NONBLOCK) + .expect("Could not create EventFd for IrqTrigger"), + } } pub fn trigger_irq(&self, irq_type: IrqType) -> Result<(), std::io::Error> { @@ -427,7 +434,7 @@ pub(crate) mod tests { DummyDevice { acked_features: 0, avail_features: 0, - interrupt_trigger: IrqTrigger::new().unwrap(), + interrupt_trigger: IrqTrigger::new(), queue_evts: vec![ EventFd::new(libc::EFD_NONBLOCK).unwrap(), EventFd::new(libc::EFD_NONBLOCK).unwrap(), @@ -1031,7 +1038,7 @@ pub(crate) mod tests { #[test] fn irq_trigger() { - let irq_trigger = IrqTrigger::new().unwrap(); + let irq_trigger = IrqTrigger::new(); assert_eq!(irq_trigger.irq_status.load(Ordering::SeqCst), 0); // Check that there are no pending irqs.
diff --git a/src/vmm/src/devices/virtio/vhost_user.rs b/src/vmm/src/devices/virtio/vhost_user.rs index d90ad16b08c..4f895e5c05e 100644 --- a/src/vmm/src/devices/virtio/vhost_user.rs +++ b/src/vmm/src/devices/virtio/vhost_user.rs @@ -901,7 +901,7 @@ pub(crate) mod tests { queue.initialize(&guest_memory).unwrap(); let event_fd = EventFd::new(0).unwrap(); - let irq_trigger = IrqTrigger::new().unwrap(); + let irq_trigger = IrqTrigger::new(); let queues = [(0, &queue, &event_fd)]; diff --git a/src/vmm/src/devices/virtio/vsock/device.rs b/src/vmm/src/devices/virtio/vsock/device.rs index e0b8477123a..fc51a61532c 100644 --- a/src/vmm/src/devices/virtio/vsock/device.rs +++ b/src/vmm/src/devices/virtio/vsock/device.rs @@ -103,7 +103,7 @@ where backend, avail_features: AVAIL_FEATURES, acked_features: 0, - irq_trigger: IrqTrigger::new().map_err(VsockError::EventFd)?, + irq_trigger: IrqTrigger::new(), activate_evt: EventFd::new(libc::EFD_NONBLOCK).map_err(VsockError::EventFd)?, device_state: DeviceState::Inactive, rx_packet: VsockPacketRx::new()?, From c2a683384c93381ad642f889b8d2cc6b1650ef92 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Thu, 17 Apr 2025 13:18:43 +0200 Subject: [PATCH 03/56] refactor: set VirtIO interrupt during activation The MMIO transport for VirtIO devices uses an `IrqTrigger` object as the object that models the logic for sending interrupts from the device to the guest. We create one such object for every VirtIO device when creating it. The MMIO device manager associates this object with an IRQ number and registers it with KVM. This commit changes the timing of association of an `IrqTrigger` with a VirtIO-mmio device. It only assigns such an object to the device during its activation. We do this to prepare for supporting a PCI transport for VirtIO devices. 
The cloud hypervisor implementation for these passes the interrupt objects used by the device during activation, so we make this change to have a uniform way to handle interrupts for both transports. Functionally, nothing changes for MMIO devices, as before activation we don't trigger any interrupts. Signed-off-by: Babis Chalios --- src/vmm/src/builder.rs | 10 +- src/vmm/src/device_manager/mmio.rs | 21 ++- src/vmm/src/device_manager/persist.rs | 24 +++- src/vmm/src/devices/virtio/balloon/device.rs | 89 ++++++------ .../devices/virtio/balloon/event_handler.rs | 9 +- src/vmm/src/devices/virtio/balloon/mod.rs | 2 +- src/vmm/src/devices/virtio/balloon/persist.rs | 24 ++-- .../src/devices/virtio/balloon/test_utils.rs | 4 +- src/vmm/src/devices/virtio/block/device.rs | 16 ++- src/vmm/src/devices/virtio/block/persist.rs | 4 + .../devices/virtio/block/vhost_user/device.rs | 38 +++-- .../src/devices/virtio/block/virtio/device.rs | 134 +++++++++++------- .../virtio/block/virtio/event_handler.rs | 9 +- .../devices/virtio/block/virtio/persist.rs | 31 ++-- .../devices/virtio/block/virtio/test_utils.rs | 12 +- src/vmm/src/devices/virtio/device.rs | 27 +++- src/vmm/src/devices/virtio/net/device.rs | 62 ++++---- src/vmm/src/devices/virtio/net/persist.rs | 20 +-- src/vmm/src/devices/virtio/net/test_utils.rs | 21 ++- src/vmm/src/devices/virtio/persist.rs | 61 +++++--- src/vmm/src/devices/virtio/queue.rs | 2 +- src/vmm/src/devices/virtio/rng/device.rs | 32 ++--- src/vmm/src/devices/virtio/rng/persist.rs | 28 ++-- src/vmm/src/devices/virtio/test_utils.rs | 17 ++- src/vmm/src/devices/virtio/transport/mmio.rs | 82 +++++++---- src/vmm/src/devices/virtio/vsock/device.rs | 34 +++-- .../src/devices/virtio/vsock/event_handler.rs | 26 ++-- src/vmm/src/devices/virtio/vsock/persist.rs | 17 ++- .../src/devices/virtio/vsock/test_utils.rs | 10 +- 29 files changed, 547 insertions(+), 319 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 84138afd79d..ba54929d451 100644 
--- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -46,7 +46,7 @@ use crate::devices::virtio::block::device::Block; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::net::Net; use crate::devices::virtio::rng::Entropy; -use crate::devices::virtio::transport::mmio::MmioTransport; +use crate::devices::virtio::transport::mmio::{IrqTrigger, MmioTransport}; use crate::devices::virtio::vsock::{Vsock, VsockUnixBackend}; #[cfg(feature = "gdb")] use crate::gdb; @@ -657,8 +657,14 @@ fn attach_virtio_device( ) -> Result<(), MmioError> { event_manager.add_subscriber(device.clone()); + let interrupt = Arc::new(IrqTrigger::new()); // The device mutex mustn't be locked here otherwise it will deadlock. - let device = MmioTransport::new(vmm.vm.guest_memory().clone(), device, is_vhost_user); + let device = MmioTransport::new( + vmm.vm.guest_memory().clone(), + interrupt, + device, + is_vhost_user, + ); vmm.mmio_device_manager .register_mmio_virtio_for_boot( vmm.vm.fd(), diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index c0e11997f96..992d3b70d61 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -53,6 +53,8 @@ pub enum MmioError { InvalidDeviceType, /// {0} InternalDeviceError(String), + /// Could not create IRQ for MMIO device: {0} + CreateIrq(#[from] std::io::Error), /// Invalid MMIO IRQ configuration. 
InvalidIrqConfig, /// Failed to register IO event: {0} @@ -205,7 +207,7 @@ impl MMIODeviceManager { vm.register_ioevent(queue_evt, &io_addr, u32::try_from(i).unwrap()) .map_err(MmioError::RegisterIoEvent)?; } - vm.register_irqfd(&locked_device.interrupt_trigger().irq_evt, irq.get()) + vm.register_irqfd(&mmio_device.interrupt.irq_evt, irq.get()) .map_err(MmioError::RegisterIrqFd)?; } @@ -549,7 +551,8 @@ mod tests { cmdline: &mut kernel_cmdline::Cmdline, dev_id: &str, ) -> Result<u64, MmioError> { - let mmio_device = MmioTransport::new(guest_mem, device, false); + let interrupt = Arc::new(IrqTrigger::new()); + let mmio_device = MmioTransport::new(guest_mem, interrupt, device, false); let device_info = self.register_mmio_virtio_for_boot( vm, resource_allocator, @@ -576,7 +579,7 @@ mod tests { dummy: u32, queues: Vec<Queue>, queue_evts: [EventFd; 1], - interrupt_trigger: IrqTrigger, + interrupt_trigger: Option<Arc<IrqTrigger>>, } impl DummyDevice { @@ -585,7 +588,7 @@ mod tests { dummy: 0, queues: QUEUE_SIZES.iter().map(|&s| Queue::new(s)).collect(), queue_evts: [EventFd::new(libc::EFD_NONBLOCK).expect("cannot create eventFD")], - interrupt_trigger: IrqTrigger::new(), + interrupt_trigger: None, } } } @@ -618,7 +621,9 @@ mod tests { } fn interrupt_trigger(&self) -> &IrqTrigger { - &self.interrupt_trigger + self.interrupt_trigger + .as_ref() + .expect("Device is not activated") } fn ack_features_by_page(&mut self, page: u32, value: u32) { @@ -636,7 +641,11 @@ mod tests { let _ = data; } - fn activate(&mut self, _: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + _: GuestMemoryMmap, + _: Arc<IrqTrigger>, + ) -> Result<(), ActivateError> { Ok(()) } diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 43ded58c4b7..2f331e644ad 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -34,7 +34,7 @@ use crate::devices::virtio::rng::Entropy; use crate::devices::virtio::rng::persist::{ EntropyConstructorArgs,
EntropyPersistError as EntropyError, EntropyState, }; -use crate::devices::virtio::transport::mmio::MmioTransport; +use crate::devices::virtio::transport::mmio::{IrqTrigger, MmioTransport}; use crate::devices::virtio::vsock::persist::{ VsockConstructorArgs, VsockState, VsockUdsConstructorArgs, }; @@ -473,11 +473,13 @@ impl<'a> Persist<'a> for MMIODeviceManager { as_subscriber: Arc<Mutex<dyn MutEventSubscriber>>, id: &String, state: &MmioTransportState, + interrupt: Arc<IrqTrigger>, device_info: &MMIODeviceInfo, event_manager: &mut EventManager| -> Result<(), Self::Error> { let restore_args = MmioTransportConstructorArgs { mem: mem.clone(), + interrupt, device, is_vhost_user, }; @@ -512,9 +514,11 @@ impl<'a> Persist<'a> for MMIODeviceManager { }; if let Some(balloon_state) = &state.balloon_device { + let interrupt = Arc::new(IrqTrigger::new()); let device = Arc::new(Mutex::new(Balloon::restore( BalloonConstructorArgs { mem: mem.clone(), + interrupt: interrupt.clone(), restored_from_file: constructor_args.restored_from_file, }, &balloon_state.device_state, @@ -530,14 +534,19 @@ impl<'a> Persist<'a> for MMIODeviceManager { device, &balloon_state.device_id, &balloon_state.transport_state, + interrupt, &balloon_state.device_info, constructor_args.event_manager, )?; } for block_state in &state.block_devices { + let interrupt = Arc::new(IrqTrigger::new()); let device = Arc::new(Mutex::new(Block::restore( - BlockConstructorArgs { mem: mem.clone() }, + BlockConstructorArgs { + mem: mem.clone(), + interrupt: interrupt.clone(), + }, &block_state.device_state, )?)); @@ -551,6 +560,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { device, &block_state.device_id, &block_state.transport_state, + interrupt, &block_state.device_info, constructor_args.event_manager, )?; @@ -573,9 +583,11 @@ impl<'a> Persist<'a> for MMIODeviceManager { } for net_state in &state.net_devices { + let interrupt = Arc::new(IrqTrigger::new()); let device = Arc::new(Mutex::new(Net::restore( NetConstructorArgs { mem: mem.clone(), + interrupt:
interrupt.clone(), mmds: constructor_args .vm_resources .mmds @@ -596,6 +608,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { device, &net_state.device_id, &net_state.transport_state, + interrupt, &net_state.device_info, constructor_args.event_manager, )?; @@ -606,9 +619,11 @@ impl<'a> Persist<'a> for MMIODeviceManager { cid: vsock_state.device_state.frontend.cid, }; let backend = VsockUnixBackend::restore(ctor_args, &vsock_state.device_state.backend)?; + let interrupt = Arc::new(IrqTrigger::new()); let device = Arc::new(Mutex::new(Vsock::restore( VsockConstructorArgs { mem: mem.clone(), + interrupt: interrupt.clone(), backend, }, &vsock_state.device_state.frontend, @@ -624,13 +639,15 @@ impl<'a> Persist<'a> for MMIODeviceManager { device, &vsock_state.device_id, &vsock_state.transport_state, + interrupt, &vsock_state.device_info, constructor_args.event_manager, )?; } if let Some(entropy_state) = &state.entropy_device { - let ctor_args = EntropyConstructorArgs::new(mem.clone()); + let interrupt = Arc::new(IrqTrigger::new()); + let ctor_args = EntropyConstructorArgs::new(mem.clone(), interrupt.clone()); let device = Arc::new(Mutex::new(Entropy::restore( ctor_args, @@ -647,6 +664,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { device, &entropy_state.device_id, &entropy_state.transport_state, + interrupt, &entropy_state.device_info, constructor_args.event_manager, )?; diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs index f6ad2f054a5..e07d287c029 100644 --- a/src/vmm/src/devices/virtio/balloon/device.rs +++ b/src/vmm/src/devices/virtio/balloon/device.rs @@ -1,7 +1,7 @@ // Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 -use std::fmt; +use std::sync::Arc; use std::time::Duration; use log::error; @@ -24,6 +24,7 @@ use super::{ VIRTIO_BALLOON_S_SWAP_OUT, }; use crate::devices::virtio::balloon::BalloonError; +use crate::devices::virtio::device::ActiveState; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::queue::InvalidAvailIdx; use crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; @@ -150,6 +151,7 @@ impl BalloonStats { } } +#[derive(Debug)] /// Virtio balloon device. pub struct Balloon { // Virtio fields. @@ -162,7 +164,6 @@ pub struct Balloon { pub(crate) queues: Vec<Queue>, pub(crate) queue_evts: [EventFd; BALLOON_NUM_QUEUES], pub(crate) device_state: DeviceState, - pub(crate) irq_trigger: IrqTrigger, // Implementation specific fields. pub(crate) restored_from_file: bool, @@ -176,29 +177,6 @@ pub struct Balloon { pub(crate) pfn_buffer: [u32; MAX_PAGE_COMPACT_BUFFER], } -// TODO Use `#[derive(Debug)]` when a new release of -// [rust-timerfd](https://github.com/main--/rust-timerfd) is published that includes -// https://github.com/main--/rust-timerfd/pull/12. -impl fmt::Debug for Balloon { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("Balloon") - .field("avail_features", &self.avail_features) - .field("acked_features", &self.acked_features) - .field("config_space", &self.config_space) - .field("activate_evt", &self.activate_evt) - .field("queues", &self.queues) - .field("queue_evts", &self.queue_evts) - .field("device_state", &self.device_state) - .field("irq_trigger", &self.irq_trigger) - .field("restored_from_file", &self.restored_from_file) - .field("stats_polling_interval_s", &self.stats_polling_interval_s) - .field("stats_desc_index", &self.stats_desc_index) - .field("latest_stats", &self.latest_stats) - .field("pfn_buffer", &self.pfn_buffer) - .finish() - } -} - impl Balloon { /// Instantiate a new balloon device.
pub fn new( @@ -243,7 +221,6 @@ impl Balloon { }, queue_evts, queues, - irq_trigger: IrqTrigger::new(), device_state: DeviceState::Inactive, activate_evt: EventFd::new(libc::EFD_NONBLOCK).map_err(BalloonError::EventFd)?, restored_from_file, @@ -283,7 +260,7 @@ impl Balloon { pub(crate) fn process_inflate(&mut self) -> Result<(), BalloonError> { // This is safe since we checked in the event handler that the device is activated. - let mem = self.device_state.mem().unwrap(); + let mem = &self.device_state.active_state().unwrap().mem; METRICS.inflate_count.inc(); let queue = &mut self.queues[INFLATE_INDEX]; @@ -392,7 +369,7 @@ impl Balloon { pub(crate) fn process_stats_queue(&mut self) -> Result<(), BalloonError> { // This is safe since we checked in the event handler that the device is activated. - let mem = self.device_state.mem().unwrap(); + let mem = &self.device_state.active_state().unwrap().mem; METRICS.stats_updates_count.inc(); while let Some(head) = self.queues[STATS_INDEX].pop()? { @@ -427,10 +404,12 @@ impl Balloon { } pub(crate) fn signal_used_queue(&self) -> Result<(), BalloonError> { - self.irq_trigger.trigger_irq(IrqType::Vring).map_err(|err| { - METRICS.event_fails.inc(); - BalloonError::InterruptError(err) - }) + self.interrupt_trigger() + .trigger_irq(IrqType::Vring) + .map_err(|err| { + METRICS.event_fails.inc(); + BalloonError::InterruptError(err) + }) } /// Process device virtio queue(s). 
@@ -467,7 +446,7 @@ impl Balloon { pub fn update_size(&mut self, amount_mib: u32) -> Result<(), BalloonError> { if self.is_activated() { self.config_space.num_pages = mib_to_pages(amount_mib)?; - self.irq_trigger + self.interrupt_trigger() .trigger_irq(IrqType::Config) .map_err(BalloonError::InterruptError) } else { @@ -580,7 +559,11 @@ impl VirtioDevice for Balloon { } fn interrupt_trigger(&self) -> &IrqTrigger { - &self.irq_trigger + &self + .device_state + .active_state() + .expect("Device is not activated") + .interrupt } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -607,13 +590,17 @@ impl VirtioDevice for Balloon { dst.copy_from_slice(data); } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc<IrqTrigger>, + ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) .map_err(ActivateError::QueueMemoryError)?; } - self.device_state = DeviceState::Activated(mem); + self.device_state = DeviceState::Activated(ActiveState { mem, interrupt }); if self.activate_evt.write(1).is_err() { METRICS.activate_fails.inc(); self.device_state = DeviceState::Inactive; @@ -642,7 +629,7 @@ pub(crate) mod tests { check_request_completion, invoke_handler_for_queue_event, set_request, }; use crate::devices::virtio::queue::{VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; - use crate::devices::virtio::test_utils::{VirtQueue, default_mem}; + use crate::devices::virtio::test_utils::{VirtQueue, default_interrupt, default_mem}; use crate::test_utils::single_region_mem; use crate::vstate::memory::GuestAddress; @@ -819,11 +806,12 @@ pub(crate) mod tests { fn test_invalid_request() { let mut balloon = Balloon::new(0, true, 0, false).unwrap(); let mem = default_mem(); + let interrupt = default_interrupt(); // Only initialize the inflate queue to demonstrate invalid request handling.
let infq = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, infq.create_queue()); balloon.set_queue(DEFLATE_INDEX, infq.create_queue()); - balloon.activate(mem.clone()).unwrap(); + balloon.activate(mem.clone(), interrupt).unwrap(); // Fill the second page with non-zero bytes. for i in 0..0x1000 { @@ -879,10 +867,11 @@ pub(crate) mod tests { fn test_inflate() { let mut balloon = Balloon::new(0, true, 0, false).unwrap(); let mem = default_mem(); + let interrupt = default_interrupt(); let infq = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, infq.create_queue()); balloon.set_queue(DEFLATE_INDEX, infq.create_queue()); - balloon.activate(mem.clone()).unwrap(); + balloon.activate(mem.clone(), interrupt).unwrap(); // Fill the third page with non-zero bytes. for i in 0..0x1000 { @@ -950,10 +939,11 @@ pub(crate) mod tests { fn test_deflate() { let mut balloon = Balloon::new(0, true, 0, false).unwrap(); let mem = default_mem(); + let interrupt = default_interrupt(); let defq = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, defq.create_queue()); balloon.set_queue(DEFLATE_INDEX, defq.create_queue()); - balloon.activate(mem.clone()).unwrap(); + balloon.activate(mem.clone(), interrupt).unwrap(); let page_addr = 0x10; @@ -999,11 +989,12 @@ pub(crate) mod tests { fn test_stats() { let mut balloon = Balloon::new(0, true, 1, false).unwrap(); let mem = default_mem(); + let interrupt = default_interrupt(); let statsq = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, statsq.create_queue()); balloon.set_queue(DEFLATE_INDEX, statsq.create_queue()); balloon.set_queue(STATS_INDEX, statsq.create_queue()); - balloon.activate(mem.clone()).unwrap(); + balloon.activate(mem.clone(), interrupt).unwrap(); let page_addr = 0x100; @@ -1079,7 +1070,7 @@ pub(crate) mod tests { assert!(balloon.stats_desc_index.is_some()); balloon.process_stats_timer_event().unwrap(); 
assert!(balloon.stats_desc_index.is_none()); - assert!(balloon.irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(balloon.interrupt_trigger().has_pending_irq(IrqType::Vring)); }); } } @@ -1088,13 +1079,14 @@ pub(crate) mod tests { fn test_process_balloon_queues() { let mut balloon = Balloon::new(0x10, true, 0, false).unwrap(); let mem = default_mem(); + let interrupt = default_interrupt(); let infq = VirtQueue::new(GuestAddress(0), &mem, 16); let defq = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, infq.create_queue()); balloon.set_queue(DEFLATE_INDEX, defq.create_queue()); - balloon.activate(mem).unwrap(); + balloon.activate(mem, interrupt).unwrap(); balloon.process_virtio_queues().unwrap(); } @@ -1105,7 +1097,8 @@ pub(crate) mod tests { let q = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, q.create_queue()); balloon.set_queue(DEFLATE_INDEX, q.create_queue()); - balloon.activate(mem).unwrap(); + let interrupt = default_interrupt(); + balloon.activate(mem, interrupt).unwrap(); assert_eq!( format!("{:?}", balloon.update_stats_polling_interval(1)), "Err(StatisticsStateChange)" @@ -1118,7 +1111,8 @@ pub(crate) mod tests { balloon.set_queue(INFLATE_INDEX, q.create_queue()); balloon.set_queue(DEFLATE_INDEX, q.create_queue()); balloon.set_queue(STATS_INDEX, q.create_queue()); - balloon.activate(mem).unwrap(); + let interrupt = default_interrupt(); + balloon.activate(mem, interrupt).unwrap(); assert_eq!( format!("{:?}", balloon.update_stats_polling_interval(0)), "Err(StatisticsStateChange)" @@ -1138,7 +1132,10 @@ pub(crate) mod tests { fn test_num_pages() { let mut balloon = Balloon::new(0, true, 0, false).unwrap(); // Switch the state to active. 
- balloon.device_state = DeviceState::Activated(single_region_mem(0x1)); + balloon.device_state = DeviceState::Activated(ActiveState { + mem: single_region_mem(0x1), + interrupt: default_interrupt(), + }); assert_eq!(balloon.num_pages(), 0); assert_eq!(balloon.actual_pages(), 0); diff --git a/src/vmm/src/devices/virtio/balloon/event_handler.rs b/src/vmm/src/devices/virtio/balloon/event_handler.rs index 4e311edc045..3922b4b8385 100644 --- a/src/vmm/src/devices/virtio/balloon/event_handler.rs +++ b/src/vmm/src/devices/virtio/balloon/event_handler.rs @@ -136,7 +136,7 @@ pub mod tests { use super::*; use crate::devices::virtio::balloon::test_utils::set_request; - use crate::devices::virtio::test_utils::{VirtQueue, default_mem}; + use crate::devices::virtio::test_utils::{VirtQueue, default_interrupt, default_mem}; use crate::vstate::memory::GuestAddress; #[test] @@ -144,6 +144,7 @@ pub mod tests { let mut event_manager = EventManager::new().unwrap(); let mut balloon = Balloon::new(0, true, 10, false).unwrap(); let mem = default_mem(); + let interrupt = default_interrupt(); let infq = VirtQueue::new(GuestAddress(0), &mem, 16); balloon.set_queue(INFLATE_INDEX, infq.create_queue()); balloon.set_queue(DEFLATE_INDEX, infq.create_queue()); @@ -179,7 +180,11 @@ pub mod tests { } // Now activate the device. - balloon.lock().unwrap().activate(mem.clone()).unwrap(); + balloon + .lock() + .unwrap() + .activate(mem.clone(), interrupt) + .unwrap(); // Process the activate event. let ev_count = event_manager.run_with_timeout(50).unwrap(); assert_eq!(ev_count, 1); diff --git a/src/vmm/src/devices/virtio/balloon/mod.rs b/src/vmm/src/devices/virtio/balloon/mod.rs index 5af1e17288a..3f3e9346545 100644 --- a/src/vmm/src/devices/virtio/balloon/mod.rs +++ b/src/vmm/src/devices/virtio/balloon/mod.rs @@ -81,7 +81,7 @@ pub enum BalloonError { MalformedPayload, /// Error restoring the balloon device queues. QueueRestoreError, - /// Received stats querry when stats are disabled. 
+ /// Received stats query when stats are disabled. StatisticsDisabled, /// Statistics cannot be enabled/disabled after activation. StatisticsStateChange, diff --git a/src/vmm/src/devices/virtio/balloon/persist.rs b/src/vmm/src/devices/virtio/balloon/persist.rs index 004fa27f8ca..397dd8aeb3e 100644 --- a/src/vmm/src/devices/virtio/balloon/persist.rs +++ b/src/vmm/src/devices/virtio/balloon/persist.rs @@ -4,7 +4,6 @@ //! Defines the structures needed for saving/restoring balloon devices. use std::sync::Arc; -use std::sync::atomic::AtomicU32; use std::time::Duration; use serde::{Deserialize, Serialize}; @@ -13,9 +12,10 @@ use timerfd::{SetTimeFlags, TimerState}; use super::*; use crate::devices::virtio::TYPE_BALLOON; use crate::devices::virtio::balloon::device::{BalloonStats, ConfigSpace}; -use crate::devices::virtio::device::DeviceState; +use crate::devices::virtio::device::{ActiveState, DeviceState}; use crate::devices::virtio::persist::VirtioDeviceState; use crate::devices::virtio::queue::FIRECRACKER_MAX_QUEUE_SIZE; +use crate::devices::virtio::transport::mmio::IrqTrigger; use crate::snapshot::Persist; use crate::vstate::memory::GuestMemoryMmap; @@ -95,6 +95,8 @@ pub struct BalloonState { pub struct BalloonConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, + /// Interrupt used from the device. 
+ pub interrupt: Arc<IrqTrigger>, pub restored_from_file: bool, } @@ -144,8 +146,6 @@ impl Persist<'_> for Balloon { FIRECRACKER_MAX_QUEUE_SIZE, ) .map_err(|_| Self::Error::QueueRestoreError)?; - balloon.irq_trigger.irq_status = - Arc::new(AtomicU32::new(state.virtio_state.interrupt_status)); balloon.avail_features = state.virtio_state.avail_features; balloon.acked_features = state.virtio_state.acked_features; balloon.latest_stats = state.latest_stats.create_stats(); @@ -155,7 +155,10 @@ impl Persist<'_> for Balloon { }; if state.virtio_state.activated { - balloon.device_state = DeviceState::Activated(constructor_args.mem); + balloon.device_state = DeviceState::Activated(ActiveState { + mem: constructor_args.mem, + interrupt: constructor_args.interrupt, + }); if balloon.stats_enabled() { // Restore the stats descriptor. @@ -178,12 +181,11 @@ impl Persist<'_> for Balloon { #[cfg(test)] mod tests { - use std::sync::atomic::Ordering; use super::*; use crate::devices::virtio::TYPE_BALLOON; use crate::devices::virtio::device::VirtioDevice; - use crate::devices::virtio::test_utils::default_mem; + use crate::devices::virtio::test_utils::{default_interrupt, default_mem}; use crate::snapshot::Snapshot; #[test] @@ -200,6 +202,7 @@ mod tests { let restored_balloon = Balloon::restore( BalloonConstructorArgs { mem: guest_mem, + interrupt: default_interrupt(), restored_from_file: true, }, &Snapshot::deserialize(&mut mem.as_slice()).unwrap(), @@ -213,11 +216,8 @@ mod tests { assert_eq!(restored_balloon.avail_features, balloon.avail_features); assert_eq!(restored_balloon.config_space, balloon.config_space); assert_eq!(restored_balloon.queues(), balloon.queues()); - assert_eq!( - restored_balloon.interrupt_status().load(Ordering::Relaxed), - balloon.interrupt_status().load(Ordering::Relaxed) - ); - assert_eq!(restored_balloon.is_activated(), balloon.is_activated()); + assert!(!restored_balloon.is_activated()); + assert!(!balloon.is_activated()); assert_eq!(
restored_balloon.stats_polling_interval_s, diff --git a/src/vmm/src/devices/virtio/balloon/test_utils.rs b/src/vmm/src/devices/virtio/balloon/test_utils.rs index 69b0b4f92a0..e588abaedee 100644 --- a/src/vmm/src/devices/virtio/balloon/test_utils.rs +++ b/src/vmm/src/devices/virtio/balloon/test_utils.rs @@ -10,6 +10,7 @@ use crate::devices::virtio::{balloon::BALLOON_NUM_QUEUES, balloon::Balloon}; #[cfg(test)] pub fn invoke_handler_for_queue_event(b: &mut Balloon, queue_index: usize) { use crate::devices::virtio::balloon::{DEFLATE_INDEX, INFLATE_INDEX, STATS_INDEX}; + use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::transport::mmio::IrqType; assert!(queue_index < BALLOON_NUM_QUEUES); @@ -23,7 +24,8 @@ pub fn invoke_handler_for_queue_event(b: &mut Balloon, queue_index: usize) { _ => unreachable!(), }; // Validate the queue operation finished successfully. - assert!(b.irq_trigger.has_pending_irq(IrqType::Vring)); + let interrupt = b.interrupt_trigger(); + assert!(interrupt.has_pending_irq(IrqType::Vring)); } pub fn set_request(queue: &VirtQueue, idx: u16, addr: u64, len: u32, flags: u16) { diff --git a/src/vmm/src/devices/virtio/block/device.rs b/src/vmm/src/devices/virtio/block/device.rs index 4f4676a24a8..5a491c537c5 100644 --- a/src/vmm/src/devices/virtio/block/device.rs +++ b/src/vmm/src/devices/virtio/block/device.rs @@ -1,6 +1,8 @@ // Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 +use std::sync::Arc; + use event_manager::{EventOps, Events, MutEventSubscriber}; use vmm_sys_util::eventfd::EventFd; @@ -176,8 +178,8 @@ impl VirtioDevice for Block { fn interrupt_trigger(&self) -> &IrqTrigger { match self { - Self::Virtio(b) => &b.irq_trigger, - Self::VhostUser(b) => &b.irq_trigger, + Self::Virtio(b) => b.interrupt_trigger(), + Self::VhostUser(b) => b.interrupt_trigger(), } } @@ -195,10 +197,14 @@ impl VirtioDevice for Block { } } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc<IrqTrigger>, + ) -> Result<(), ActivateError> { match self { - Self::Virtio(b) => b.activate(mem), - Self::VhostUser(b) => b.activate(mem), + Self::Virtio(b) => b.activate(mem, interrupt), + Self::VhostUser(b) => b.activate(mem, interrupt), } } diff --git a/src/vmm/src/devices/virtio/block/persist.rs b/src/vmm/src/devices/virtio/block/persist.rs index 2d83c416d9f..e7ae1768cca 100644 --- a/src/vmm/src/devices/virtio/block/persist.rs +++ b/src/vmm/src/devices/virtio/block/persist.rs @@ -1,10 +1,13 @@ // Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 +use std::sync::Arc; + use serde::{Deserialize, Serialize}; use super::vhost_user::persist::VhostUserBlockState; use super::virtio::persist::VirtioBlockState; +use crate::devices::virtio::transport::mmio::IrqTrigger; use crate::vstate::memory::GuestMemoryMmap; /// Block device state.
@@ -18,4 +21,5 @@ pub enum BlockState { #[derive(Debug)] pub struct BlockConstructorArgs { pub mem: GuestMemoryMmap, + pub interrupt: Arc<IrqTrigger>, } diff --git a/src/vmm/src/devices/virtio/block/vhost_user/device.rs b/src/vmm/src/devices/virtio/block/vhost_user/device.rs index 264db2fa7f0..22429996d5f 100644 --- a/src/vmm/src/devices/virtio/block/vhost_user/device.rs +++ b/src/vmm/src/devices/virtio/block/vhost_user/device.rs @@ -14,6 +14,7 @@ use vmm_sys_util::eventfd::EventFd; use super::{NUM_QUEUES, QUEUE_SIZE, VhostUserBlockError}; use crate::devices::virtio::block::CacheType; +use crate::devices::virtio::device::ActiveState; use crate::devices::virtio::device::{DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_blk::{VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO}; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; @@ -118,7 +119,6 @@ pub struct VhostUserBlockImpl { pub queues: Vec<Queue>, pub queue_evts: [EventFd; u64_to_usize(NUM_QUEUES)], pub device_state: DeviceState, - pub irq_trigger: IrqTrigger, // Implementation specific fields. pub id: String, @@ -144,7 +144,6 @@ impl std::fmt::Debug for VhostUserBlockImpl { .field("queues", &self.queues) .field("queue_evts", &self.queue_evts) .field("device_state", &self.device_state) - .field("irq_trigger", &self.irq_trigger) .field("id", &self.id) .field("partuuid", &self.partuuid) .field("cache_type", &self.cache_type) @@ -204,7 +203,6 @@ impl VhostUserBlockImpl { let queue_evts = [EventFd::new(libc::EFD_NONBLOCK).map_err(VhostUserBlockError::EventFd)?; u64_to_usize(NUM_QUEUES)]; let device_state = DeviceState::Inactive; - let irq_trigger = IrqTrigger::new(); // We negotiated features with backend. Now these acked_features // are available for guest driver to choose from.
@@ -226,7 +224,6 @@ impl VhostUserBlockImpl { queues, queue_evts, device_state, - irq_trigger, id: config.drive_id, partuuid: config.partuuid, @@ -257,6 +254,12 @@ impl VhostUserBlockImpl { pub fn config_update(&mut self) -> Result<(), VhostUserBlockError> { let start_time = get_time_us(ClockType::Monotonic); + let interrupt = self + .device_state + .active_state() + .expect("Device is not initialized") + .interrupt + .clone(); // This buffer is used for config size check in vhost crate. let buffer = [0u8; BLOCK_CONFIG_SPACE_SIZE as usize]; @@ -271,7 +274,7 @@ impl VhostUserBlockImpl { ) .map_err(VhostUserBlockError::Vhost)?; self.config_space = new_config_space; - self.irq_trigger + interrupt .trigger_irq(IrqType::Config) .map_err(VhostUserBlockError::IrqTrigger)?; @@ -312,7 +315,11 @@ impl VirtioDevice for VhostUserBlock } fn interrupt_trigger(&self) -> &IrqTrigger { - &self.irq_trigger + &self + .device_state + .active_state() + .expect("Device is not initialized") + .interrupt } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -331,7 +338,11 @@ impl VirtioDevice for VhostUserBlock // Other block config fields are immutable. 
} - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc, + ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) .map_err(ActivateError::QueueMemoryError)?; @@ -346,14 +357,14 @@ impl VirtioDevice for VhostUserBlock self.vu_handle.setup_backend( &mem, &[(0, &self.queues[0], &self.queue_evts[0])], - &self.irq_trigger, + &interrupt, ) }) .map_err(|err| { self.metrics.activate_fails.inc(); ActivateError::VhostUser(err) })?; - self.device_state = DeviceState::Activated(mem); + self.device_state = DeviceState::Activated(ActiveState { mem, interrupt }); let delta_us = get_time_us(ClockType::Monotonic) - start_time; self.metrics.activate_time_us.store(delta_us); Ok(()) @@ -376,7 +387,7 @@ mod tests { use super::*; use crate::devices::virtio::block::virtio::device::FileEngineType; - use crate::devices::virtio::test_utils::VirtQueue; + use crate::devices::virtio::test_utils::{VirtQueue, default_interrupt, default_mem}; use crate::devices::virtio::transport::mmio::VIRTIO_MMIO_INT_CONFIG; use crate::devices::virtio::vhost_user::tests::create_mem; use crate::test_utils::create_tmp_socket; @@ -653,6 +664,10 @@ mod tests { assert_eq!(vhost_block.config_space, vec![0x69, 0x69, 0x69]); // Testing [`config_update`] + vhost_block.device_state = DeviceState::Activated(ActiveState { + mem: default_mem(), + interrupt: default_interrupt(), + }); vhost_block.config_space = vec![]; vhost_block.config_update().unwrap(); assert_eq!(vhost_block.config_space, vec![0x69, 0x69, 0x69]); @@ -784,9 +799,10 @@ mod tests { let guest_memory = create_mem(file, ®ions); let q = VirtQueue::new(GuestAddress(0), &guest_memory, 16); vhost_block.queues[0] = q.create_queue(); + let interrupt = default_interrupt(); // During actiavion of the device features, memory and queues should be set and activated. 
- vhost_block.activate(guest_memory).unwrap(); + vhost_block.activate(guest_memory, interrupt).unwrap(); assert!(unsafe { *vhost_block.vu_handle.vu.features_are_set.get() }); assert!(unsafe { *vhost_block.vu_handle.vu.memory_is_set.get() }); assert!(unsafe { *vhost_block.vu_handle.vu.vring_enabled.get() }); diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index e89443e5bd9..413410f2af6 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -23,7 +23,7 @@ use super::request::*; use super::{BLOCK_QUEUE_SIZES, SECTOR_SHIFT, SECTOR_SIZE, VirtioBlockError, io as block_io}; use crate::devices::virtio::block::CacheType; use crate::devices::virtio::block::virtio::metrics::{BlockDeviceMetrics, BlockMetricsPerDevice}; -use crate::devices::virtio::device::{DeviceState, VirtioDevice}; +use crate::devices::virtio::device::{ActiveState, DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_blk::{ VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO, VIRTIO_BLK_ID_BYTES, }; @@ -250,7 +250,6 @@ pub struct VirtioBlock { pub queues: Vec, pub queue_evts: [EventFd; 1], pub device_state: DeviceState, - pub irq_trigger: IrqTrigger, // Implementation specific fields. pub id: String, @@ -323,7 +322,6 @@ impl VirtioBlock { queues, queue_evts, device_state: DeviceState::Inactive, - irq_trigger: IrqTrigger::new(), id: config.drive_id.clone(), partuuid: config.partuuid, @@ -388,34 +386,40 @@ impl VirtioBlock { /// Device specific function for peaking inside a queue and processing descriptors. pub fn process_queue(&mut self, queue_index: usize) -> Result<(), InvalidAvailIdx> { // This is safe since we checked in the event handler that the device is activated. 
- let mem = self.device_state.mem().unwrap(); + let active_state = self.device_state.active_state().unwrap(); let queue = &mut self.queues[queue_index]; let mut used_any = false; while let Some(head) = queue.pop_or_enable_notification()? { self.metrics.remaining_reqs_count.add(queue.len().into()); - let processing_result = match Request::parse(&head, mem, self.disk.nsectors) { - Ok(request) => { - if request.rate_limit(&mut self.rate_limiter) { - // Stop processing the queue and return this descriptor chain to the - // avail ring, for later processing. - queue.undo_pop(); - self.metrics.rate_limiter_throttled_events.inc(); - break; + let processing_result = + match Request::parse(&head, &active_state.mem, self.disk.nsectors) { + Ok(request) => { + if request.rate_limit(&mut self.rate_limiter) { + // Stop processing the queue and return this descriptor chain to the + // avail ring, for later processing. + queue.undo_pop(); + self.metrics.rate_limiter_throttled_events.inc(); + break; + } + + request.process( + &mut self.disk, + head.index, + &active_state.mem, + &self.metrics, + ) } - - request.process(&mut self.disk, head.index, mem, &self.metrics) - } - Err(err) => { - error!("Failed to parse available descriptor chain: {:?}", err); - self.metrics.execute_fails.inc(); - ProcessingResult::Executed(FinishedRequest { - num_bytes_to_mem: 0, - desc_idx: head.index, - }) - } - }; + Err(err) => { + error!("Failed to parse available descriptor chain: {:?}", err); + self.metrics.execute_fails.inc(); + ProcessingResult::Executed(FinishedRequest { + num_bytes_to_mem: 0, + desc_idx: head.index, + }) + } + }; match processing_result { ProcessingResult::Submitted => {} @@ -440,7 +444,8 @@ impl VirtioBlock { queue.advance_used_ring_idx(); if used_any && queue.prepare_kick() { - self.irq_trigger + active_state + .interrupt .trigger_irq(IrqType::Vring) .unwrap_or_else(|_| { self.metrics.event_fails.inc(); @@ -464,11 +469,11 @@ impl VirtioBlock { let engine = 
unwrap_async_file_engine_or_return!(&mut self.disk.file_engine); // This is safe since we checked in the event handler that the device is activated. - let mem = self.device_state.mem().unwrap(); + let active_state = self.device_state.active_state().unwrap(); let queue = &mut self.queues[0]; loop { - match engine.pop(mem) { + match engine.pop(&active_state.mem) { Err(error) => { error!("Failed to read completed io_uring entry: {:?}", error); break; @@ -487,7 +492,7 @@ impl VirtioBlock { ))), ), }; - let finished = pending.finish(mem, res, &self.metrics); + let finished = pending.finish(&active_state.mem, res, &self.metrics); queue .add_used(finished.desc_idx, finished.num_bytes_to_mem) .unwrap_or_else(|err| { @@ -502,7 +507,8 @@ impl VirtioBlock { queue.advance_used_ring_idx(); if queue.prepare_kick() { - self.irq_trigger + active_state + .interrupt .trigger_irq(IrqType::Vring) .unwrap_or_else(|_| { self.metrics.event_fails.inc(); @@ -531,7 +537,9 @@ impl VirtioBlock { self.config_space.capacity = self.disk.nsectors.to_le(); // virtio_block_config_space(); // Kick the driver to pick up the changes. 
- self.irq_trigger.trigger_irq(IrqType::Config).unwrap(); + self.interrupt_trigger() + .trigger_irq(IrqType::Config) + .unwrap(); self.metrics.update_count.inc(); Ok(()) @@ -599,7 +607,11 @@ impl VirtioDevice for VirtioBlock { } fn interrupt_trigger(&self) -> &IrqTrigger { - &self.irq_trigger + &self + .device_state + .active_state() + .expect("Device is not initialized") + .interrupt } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -628,7 +640,11 @@ impl VirtioDevice for VirtioBlock { dst.copy_from_slice(data); } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc, + ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) .map_err(ActivateError::QueueMemoryError)?; @@ -645,7 +661,7 @@ impl VirtioDevice for VirtioBlock { self.metrics.activate_fails.inc(); return Err(ActivateError::EventFd); } - self.device_state = DeviceState::Activated(mem); + self.device_state = DeviceState::Activated(ActiveState { mem, interrupt }); Ok(()) } @@ -688,7 +704,7 @@ mod tests { simulate_queue_event, }; use crate::devices::virtio::queue::{VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; - use crate::devices::virtio::test_utils::{VirtQueue, default_mem}; + use crate::devices::virtio::test_utils::{VirtQueue, default_interrupt, default_mem}; use crate::rate_limiter::TokenType; use crate::vstate::memory::{Address, Bytes, GuestAddress}; @@ -863,9 +879,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -891,9 +908,10 @@ mod tests { let mut block = 
default_block(engine); // Default mem size is 0x10000 let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -954,9 +972,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1005,9 +1024,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1037,9 +1057,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); vq.dtable[1].set(0xf000, 0x1000, VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, 2); @@ -1073,9 +1094,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); let 
mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1120,9 +1142,10 @@ mod tests { // Default mem size is 0x10000 let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1359,9 +1382,10 @@ mod tests { { // Default mem size is 0x10000 let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); vq.dtable[1].set(0xff00, 0x1000, VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, 2); @@ -1400,9 +1424,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1446,9 +1471,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - 
block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1570,9 +1596,10 @@ mod tests { let mut block = default_block(FileEngineType::Async); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, IO_URING_NUM_ENTRIES * 4); block.queues[0] = vq.create_queue(); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); // Run scenario that doesn't trigger FullSq BlockError: Add sq_size flush requests. add_flush_requests_batch(&mut block, &vq, IO_URING_NUM_ENTRIES); @@ -1604,9 +1631,10 @@ mod tests { let mut block = default_block(FileEngineType::Async); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, IO_URING_NUM_ENTRIES * 4); block.queues[0] = vq.create_queue(); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); // Run scenario that triggers FullCqError. Push 2 * IO_URING_NUM_ENTRIES and wait for // completion. Then try to push another entry. @@ -1634,9 +1662,10 @@ mod tests { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); block.queues[0] = vq.create_queue(); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); // Add a batch of flush requests. 
add_flush_requests_batch(&mut block, &vq, 5); @@ -1653,9 +1682,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1722,9 +1752,10 @@ mod tests { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); - block.activate(mem.clone()).unwrap(); + block.activate(mem.clone(), interrupt).unwrap(); read_blk_req_descriptors(&vq); let request_type_addr = GuestAddress(vq.dtable[0].addr.get()); @@ -1804,6 +1835,11 @@ mod tests { fn test_update_disk_image() { for engine in [FileEngineType::Sync, FileEngineType::Async] { let mut block = default_block(engine); + let mem = default_mem(); + let interrupt = default_interrupt(); + let vq = VirtQueue::new(GuestAddress(0), &mem, 16); + set_queue(&mut block, 0, vq.create_queue()); + block.activate(mem, interrupt).unwrap(); let f = TempFile::new().unwrap(); let path = f.as_path(); let mdata = metadata(path).unwrap(); diff --git a/src/vmm/src/devices/virtio/block/virtio/event_handler.rs b/src/vmm/src/devices/virtio/block/virtio/event_handler.rs index db69e23d7f0..03c09a01972 100644 --- a/src/vmm/src/devices/virtio/block/virtio/event_handler.rs +++ b/src/vmm/src/devices/virtio/block/virtio/event_handler.rs @@ -124,7 +124,7 @@ mod tests { }; use crate::devices::virtio::block::virtio::{VIRTIO_BLK_S_OK, VIRTIO_BLK_T_OUT}; use crate::devices::virtio::queue::VIRTQ_DESC_F_NEXT; - use crate::devices::virtio::test_utils::{VirtQueue, 
default_mem}; + use crate::devices::virtio::test_utils::{VirtQueue, default_interrupt, default_mem}; use crate::vstate::memory::{Bytes, GuestAddress}; #[test] @@ -132,6 +132,7 @@ mod tests { let mut event_manager = EventManager::new().unwrap(); let mut block = default_block(FileEngineType::default()); let mem = default_mem(); + let interrupt = default_interrupt(); let vq = VirtQueue::new(GuestAddress(0), &mem, 16); set_queue(&mut block, 0, vq.create_queue()); read_blk_req_descriptors(&vq); @@ -162,7 +163,11 @@ mod tests { assert_eq!(ev_count, 0); // Now activate the device. - block.lock().unwrap().activate(mem.clone()).unwrap(); + block + .lock() + .unwrap() + .activate(mem.clone(), interrupt) + .unwrap(); // Process the activate event. let ev_count = event_manager.run_with_timeout(50).unwrap(); assert_eq!(ev_count, 1); diff --git a/src/vmm/src/devices/virtio/block/virtio/persist.rs b/src/vmm/src/devices/virtio/block/virtio/persist.rs index 33a33968e53..57e4a11b9c1 100644 --- a/src/vmm/src/devices/virtio/block/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/block/virtio/persist.rs @@ -3,9 +3,6 @@ //! Defines the structures needed for saving/restoring block devices. 
-use std::sync::Arc; -use std::sync::atomic::AtomicU32; - use device::ConfigSpace; use serde::{Deserialize, Serialize}; use vmm_sys_util::eventfd::EventFd; @@ -16,10 +13,9 @@ use crate::devices::virtio::TYPE_BLOCK; use crate::devices::virtio::block::persist::BlockConstructorArgs; use crate::devices::virtio::block::virtio::device::FileEngineType; use crate::devices::virtio::block::virtio::metrics::BlockMetricsPerDevice; -use crate::devices::virtio::device::DeviceState; +use crate::devices::virtio::device::{ActiveState, DeviceState}; use crate::devices::virtio::generated::virtio_blk::VIRTIO_BLK_F_RO; use crate::devices::virtio::persist::VirtioDeviceState; -use crate::devices::virtio::transport::mmio::IrqTrigger; use crate::rate_limiter::RateLimiter; use crate::rate_limiter::persist::RateLimiterState; use crate::snapshot::Persist; @@ -112,14 +108,14 @@ impl Persist<'_> for VirtioBlock { ) .map_err(VirtioBlockError::Persist)?; - let mut irq_trigger = IrqTrigger::new(); - irq_trigger.irq_status = Arc::new(AtomicU32::new(state.virtio_state.interrupt_status)); - let avail_features = state.virtio_state.avail_features; let acked_features = state.virtio_state.acked_features; let device_state = if state.virtio_state.activated { - DeviceState::Activated(constructor_args.mem) + DeviceState::Activated(ActiveState { + mem: constructor_args.mem, + interrupt: constructor_args.interrupt, + }) } else { DeviceState::Inactive }; @@ -137,7 +133,6 @@ impl Persist<'_> for VirtioBlock { queues, queue_evts, device_state, - irq_trigger, id: state.id.clone(), partuuid: state.partuuid.clone(), @@ -155,14 +150,12 @@ impl Persist<'_> for VirtioBlock { #[cfg(test)] mod tests { - use std::sync::atomic::Ordering; - use vmm_sys_util::tempfile::TempFile; use super::*; use crate::devices::virtio::block::virtio::device::VirtioBlockConfig; use crate::devices::virtio::device::VirtioDevice; - use crate::devices::virtio::test_utils::default_mem; + use crate::devices::virtio::test_utils::{default_interrupt, 
default_mem}; use crate::snapshot::Snapshot; #[test] @@ -234,7 +227,10 @@ mod tests { // Restore the block device. let restored_block = VirtioBlock::restore( - BlockConstructorArgs { mem: guest_mem }, + BlockConstructorArgs { + mem: guest_mem, + interrupt: default_interrupt(), + }, &Snapshot::deserialize(&mut mem.as_slice()).unwrap(), ) .unwrap(); @@ -244,11 +240,8 @@ mod tests { assert_eq!(restored_block.avail_features(), block.avail_features()); assert_eq!(restored_block.acked_features(), block.acked_features()); assert_eq!(restored_block.queues(), block.queues()); - assert_eq!( - restored_block.interrupt_status().load(Ordering::Relaxed), - block.interrupt_status().load(Ordering::Relaxed) - ); - assert_eq!(restored_block.is_activated(), block.is_activated()); + assert!(!block.is_activated()); + assert!(!restored_block.is_activated()); // Test that block specific fields are the same. assert_eq!(restored_block.disk.file_path, block.disk.file_path); diff --git a/src/vmm/src/devices/virtio/block/virtio/test_utils.rs b/src/vmm/src/devices/virtio/block/virtio/test_utils.rs index b05e899f32d..14e2f1d33d0 100644 --- a/src/vmm/src/devices/virtio/block/virtio/test_utils.rs +++ b/src/vmm/src/devices/virtio/block/virtio/test_utils.rs @@ -16,6 +16,8 @@ use crate::devices::virtio::block::virtio::device::FileEngineType; #[cfg(test)] use crate::devices::virtio::block::virtio::io::FileEngine; use crate::devices::virtio::block::virtio::{CacheType, VirtioBlock}; +#[cfg(test)] +use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::{Queue, VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; use crate::devices::virtio::test_utils::{VirtQueue, VirtqDesc}; #[cfg(test)] @@ -82,7 +84,10 @@ pub fn simulate_queue_event(b: &mut VirtioBlock, maybe_expected_irq: Option, +} + /// Enum that indicates if a VirtioDevice is inactive or has been activated /// and memory attached to it. 
#[derive(Debug)] pub enum DeviceState { Inactive, - Activated(GuestMemoryMmap), + Activated(ActiveState), } impl DeviceState { @@ -35,10 +42,10 @@ impl DeviceState { } } - /// Gets the memory attached to the device if it is activated. - pub fn mem(&self) -> Option<&GuestMemoryMmap> { + /// Gets the memory and interrupt attached to the device if it is activated. + pub fn active_state(&self) -> Option<&ActiveState> { match self { - DeviceState::Activated(mem) => Some(mem), + DeviceState::Activated(state) => Some(state), DeviceState::Inactive => None, } } @@ -130,7 +137,11 @@ pub trait VirtioDevice: AsAny + Send { fn write_config(&mut self, offset: u64, data: &[u8]); /// Performs the formal activation for a device, which can be verified also with `is_activated`. - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError>; + fn activate( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc, + ) -> Result<(), ActivateError>; /// Checks if the resources of this device are activated. 
fn is_activated(&self) -> bool; @@ -206,7 +217,11 @@ pub(crate) mod tests { todo!() } - fn activate(&mut self, _mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + _mem: GuestMemoryMmap, + _interrupt: Arc, + ) -> Result<(), ActivateError> { todo!() } diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index 55a94636495..9949b404809 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -15,7 +15,7 @@ use log::error; use vmm_sys_util::eventfd::EventFd; use super::NET_QUEUE_MAX_SIZE; -use crate::devices::virtio::device::{DeviceState, VirtioDevice}; +use crate::devices::virtio::device::{ActiveState, DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::generated::virtio_net::{ VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, @@ -250,8 +250,6 @@ pub struct Net { tx_frame_headers: [u8; frame_hdr_len()], - pub(crate) irq_trigger: IrqTrigger, - pub(crate) config_space: ConfigSpace, pub(crate) guest_mac: Option, @@ -314,7 +312,6 @@ impl Net { tx_rate_limiter, rx_frame_buf: [0u8; MAX_BUFFER_SIZE], tx_frame_headers: [0u8; frame_hdr_len()], - irq_trigger: IrqTrigger::new(), config_space, guest_mac, device_state: DeviceState::Inactive, @@ -400,7 +397,7 @@ impl Net { queue.advance_used_ring_idx(); if queue.prepare_kick() { - self.irq_trigger + self.interrupt_trigger() .trigger_irq(IrqType::Vring) .map_err(|err| { self.metrics.event_fails.inc(); @@ -465,7 +462,7 @@ impl Net { /// Parse available RX `DescriptorChains` from the queue pub fn parse_rx_descriptors(&mut self) -> Result<(), InvalidAvailIdx> { // This is safe since we checked in the event handler that the device is activated. 
- let mem = self.device_state.mem().unwrap(); + let mem = &self.device_state.active_state().unwrap().mem; let queue = &mut self.queues[RX_INDEX]; while let Some(head) = queue.pop_or_enable_notification()? { let index = head.index; @@ -687,7 +684,7 @@ impl Net { fn process_tx(&mut self) -> Result<(), DeviceError> { // This is safe since we checked in the event handler that the device is activated. - let mem = self.device_state.mem().unwrap(); + let mem = &self.device_state.active_state().unwrap().mem; // The MMDS network stack works like a state machine, based on synchronous calls, and // without being added to any event loop. If any frame is accepted by the MMDS, we also @@ -970,8 +967,13 @@ impl VirtioDevice for Net { } fn interrupt_trigger(&self) -> &IrqTrigger { - &self.irq_trigger + &self + .device_state + .active_state() + .expect("Device is not implemented") + .interrupt } + fn read_config(&self, offset: u64, data: &mut [u8]) { if let Some(config_space_bytes) = self.config_space.as_slice().get(u64_to_usize(offset)..) { let len = config_space_bytes.len().min(data.len()); @@ -1000,7 +1002,11 @@ impl VirtioDevice for Net { self.metrics.mac_address_updates.inc(); } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc, + ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) .map_err(ActivateError::QueueMemoryError)?; @@ -1024,7 +1030,7 @@ impl VirtioDevice for Net { self.metrics.activate_fails.inc(); return Err(ActivateError::EventFd); } - self.device_state = DeviceState::Activated(mem); + self.device_state = DeviceState::Activated(ActiveState { mem, interrupt }); Ok(()) } @@ -1403,7 +1409,7 @@ pub mod tests { // Check that the used queue has advanced. 
assert_eq!(th.rxq.used.idx.get(), 4); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); // Check that the invalid descriptor chains have been discarded th.rxq.check_used_elem(0, 0, 0); th.rxq.check_used_elem(1, 3, 0); @@ -1460,7 +1466,7 @@ pub mod tests { assert!(th.net().rx_buffer.used_descriptors == 0); // Check that the used queue has advanced. assert_eq!(th.rxq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); // Check that the frame has been written successfully to the Rx descriptor chain. header_set_num_buffers(frame.as_mut_slice(), 1); th.rxq @@ -1523,7 +1529,7 @@ pub mod tests { assert!(th.net().rx_buffer.used_bytes == 0); // Check that the used queue has advanced. assert_eq!(th.rxq.used.idx.get(), 2); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); // Check that the 1st frame was written successfully to the 1st Rx descriptor chain. header_set_num_buffers(frame_1.as_mut_slice(), 1); th.rxq @@ -1581,7 +1587,7 @@ pub mod tests { assert!(th.net().rx_buffer.used_bytes == 0); // Check that the used queue has advanced. assert_eq!(th.rxq.used.idx.get(), 2); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); // 2 chains should be used for the packet. header_set_num_buffers(frame.as_mut_slice(), 2); @@ -1646,7 +1652,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(0, 0, 0); // Check that the frame was skipped. 
assert!(!tap_traffic_simulator.pop_rx_packet(&mut [])); @@ -1669,7 +1675,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(0, 0, 0); // Check that the frame was skipped. assert!(!tap_traffic_simulator.pop_rx_packet(&mut [])); @@ -1696,7 +1702,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(0, 0, 0); // Check that the frame was skipped. assert!(!tap_traffic_simulator.pop_rx_packet(&mut [])); @@ -1719,7 +1725,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(0, 0, 0); // Check that the frame was skipped. assert!(!tap_traffic_simulator.pop_rx_packet(&mut [])); @@ -1758,7 +1764,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 4); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(3, 4, 0); // Check that the valid frame was sent to the tap. let mut buf = vec![0; 1000]; @@ -1789,7 +1795,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(0, 3, 0); // Check that the frame was sent to the tap. 
let mut buf = vec![0; 1000]; @@ -1818,7 +1824,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(0, 0, 0); // dropping th would double close the tap fd, so leak it @@ -1849,7 +1855,7 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 2); - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); th.txq.check_used_elem(0, 0, 0); th.txq.check_used_elem(1, 3, 0); // Check that the first frame was sent to the tap. @@ -2201,7 +2207,7 @@ pub mod tests { assert_eq!(th.net().metrics.rx_rate_limiter_throttled.count(), 1); assert!(th.net().rx_buffer.used_descriptors != 0); // assert that no operation actually completed (limiter blocked it) - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); // make sure the data is still queued for processing assert_eq!(th.rxq.used.idx.get(), 0); } @@ -2229,7 +2235,7 @@ pub mod tests { // validate the rate_limiter is no longer blocked assert!(!th.net().rx_rate_limiter.is_blocked()); // make sure the virtio queue operation completed this time - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); // make sure the data queue advanced assert_eq!(th.rxq.used.idx.get(), 1); th.rxq @@ -2326,14 +2332,14 @@ pub mod tests { assert!(th.net().metrics.rx_rate_limiter_throttled.count() >= 1); assert!(th.net().rx_buffer.used_descriptors != 0); // assert that no operation actually completed (limiter blocked it) - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); // make sure the 
data is still queued for processing assert_eq!(th.rxq.used.idx.get(), 0); // trigger the RX handler again, this time it should do the limiter fast path exit th.simulate_event(NetEvent::Tap); // assert that no operation actually completed, that the limiter blocked it - assert!(!&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(!th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); // make sure the data is still queued for processing assert_eq!(th.rxq.used.idx.get(), 0); } @@ -2346,7 +2352,7 @@ pub mod tests { { th.simulate_event(NetEvent::RxRateLimiter); // make sure the virtio queue operation completed this time - assert!(&th.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); // make sure the data queue advanced assert_eq!(th.rxq.used.idx.get(), 1); th.rxq @@ -2416,7 +2422,7 @@ pub mod tests { assert_eq!(net.queue_events().len(), NET_QUEUE_SIZES.len()); // Test interrupts. - assert!(!&net.irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(!net.interrupt_trigger().has_pending_irq(IrqType::Vring)); } #[test] diff --git a/src/vmm/src/devices/virtio/net/persist.rs b/src/vmm/src/devices/virtio/net/persist.rs index 50e761273db..9072d3dd5e7 100644 --- a/src/vmm/src/devices/virtio/net/persist.rs +++ b/src/vmm/src/devices/virtio/net/persist.rs @@ -4,7 +4,6 @@ //! Defines the structures needed for saving/restoring net devices. 
use std::io; -use std::sync::atomic::AtomicU32; use std::sync::{Arc, Mutex}; use serde::{Deserialize, Serialize}; @@ -12,8 +11,9 @@ use serde::{Deserialize, Serialize}; use super::device::{Net, RxBuffers}; use super::{NET_NUM_QUEUES, NET_QUEUE_MAX_SIZE, RX_INDEX, TapError}; use crate::devices::virtio::TYPE_NET; -use crate::devices::virtio::device::DeviceState; +use crate::devices::virtio::device::{ActiveState, DeviceState}; use crate::devices::virtio::persist::{PersistError as VirtioStateError, VirtioDeviceState}; +use crate::devices::virtio::transport::mmio::IrqTrigger; use crate::mmds::data_store::Mmds; use crate::mmds::ns::MmdsNetworkStack; use crate::mmds::persist::MmdsNetworkStackState; @@ -71,6 +71,8 @@ pub struct NetState { pub struct NetConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, + /// Interrupt for the device. + pub interrupt: Arc, /// Pointer to the MMDS data store. pub mmds: Option>>, } @@ -148,7 +150,6 @@ impl Persist<'_> for Net { NET_NUM_QUEUES, NET_QUEUE_MAX_SIZE, )?; - net.irq_trigger.irq_status = Arc::new(AtomicU32::new(state.virtio_state.interrupt_status)); net.avail_features = state.virtio_state.avail_features; net.acked_features = state.virtio_state.acked_features; @@ -158,7 +159,10 @@ impl Persist<'_> for Net { .set_offload(supported_flags) .map_err(NetPersistError::TapSetOffload)?; - net.device_state = DeviceState::Activated(constructor_args.mem); + net.device_state = DeviceState::Activated(ActiveState { + mem: constructor_args.mem, + interrupt: constructor_args.interrupt, + }); // Recreate `Net::rx_buffer`. We do it by re-parsing the RX queue. We're temporarily // rolling back `next_avail` in the RX queue and call `parse_rx_descriptors`. 
@@ -175,12 +179,11 @@ impl Persist<'_> for Net { #[cfg(test)] mod tests { - use std::sync::atomic::Ordering; use super::*; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::net::test_utils::{default_net, default_net_no_mmds}; - use crate::devices::virtio::test_utils::default_mem; + use crate::devices::virtio::test_utils::{default_interrupt, default_mem}; use crate::snapshot::Snapshot; fn validate_save_and_restore(net: Net, mmds_ds: Option>>) { @@ -213,6 +216,7 @@ mod tests { match Net::restore( NetConstructorArgs { mem: guest_mem, + interrupt: default_interrupt(), mmds: mmds_ds, }, &Snapshot::deserialize(&mut mem.as_slice()).unwrap(), @@ -222,10 +226,6 @@ mod tests { assert_eq!(restored_net.device_type(), TYPE_NET); assert_eq!(restored_net.avail_features(), virtio_state.avail_features); assert_eq!(restored_net.acked_features(), virtio_state.acked_features); - assert_eq!( - restored_net.interrupt_status().load(Ordering::Relaxed), - virtio_state.interrupt_status - ); assert_eq!(restored_net.is_activated(), virtio_state.activated); // Test that net specific fields are the same. 
diff --git a/src/vmm/src/devices/virtio/net/test_utils.rs b/src/vmm/src/devices/virtio/net/test_utils.rs index ec52883e979..c81ad58205c 100644 --- a/src/vmm/src/devices/virtio/net/test_utils.rs +++ b/src/vmm/src/devices/virtio/net/test_utils.rs @@ -299,7 +299,7 @@ pub mod test { }; use crate::devices::virtio::net::{MAX_BUFFER_SIZE, Net, RX_INDEX, TX_INDEX}; use crate::devices::virtio::queue::{VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; - use crate::devices::virtio::test_utils::{VirtQueue, VirtqDesc}; + use crate::devices::virtio::test_utils::{VirtQueue, VirtqDesc, default_interrupt}; use crate::devices::virtio::transport::mmio::IrqType; use crate::logger::IncMetric; use crate::vstate::memory::{Address, Bytes, GuestAddress, GuestMemoryMmap}; @@ -359,7 +359,12 @@ pub mod test { } pub fn activate_net(&mut self) { - self.net.lock().unwrap().activate(self.mem.clone()).unwrap(); + let interrupt = default_interrupt(); + self.net + .lock() + .unwrap() + .activate(self.mem.clone(), interrupt) + .unwrap(); // Process the activate event. let ev_count = self.event_manager.run_with_timeout(100).unwrap(); assert_eq!(ev_count, 1); @@ -436,7 +441,11 @@ pub mod test { old_used_descriptors + 1 ); - assert!(&self.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!( + self.net() + .interrupt_trigger() + .has_pending_irq(IrqType::Vring) + ); frame } @@ -462,7 +471,11 @@ pub mod test { ); // Check that the expected frame was sent to the Rx queue eventually. 
assert_eq!(self.rxq.used.idx.get(), used_idx + 1); - assert!(&self.net().irq_trigger.has_pending_irq(IrqType::Vring)); + assert!( + self.net() + .interrupt_trigger() + .has_pending_irq(IrqType::Vring) + ); self.rxq .check_used_elem(used_idx, 0, expected_frame.len().try_into().unwrap()); self.rxq.dtable[0].check_data(expected_frame); diff --git a/src/vmm/src/devices/virtio/persist.rs b/src/vmm/src/devices/virtio/persist.rs index 1a1eb6dba7d..776c7179048 100644 --- a/src/vmm/src/devices/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/persist.rs @@ -10,6 +10,7 @@ use std::sync::{Arc, Mutex}; use serde::{Deserialize, Serialize}; use super::queue::{InvalidAvailIdx, QueueError}; +use super::transport::mmio::IrqTrigger; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::generated::virtio_ring::VIRTIO_RING_F_EVENT_IDX; use crate::devices::virtio::queue::Queue; @@ -123,8 +124,6 @@ pub struct VirtioDeviceState { pub acked_features: u64, /// List of queues. pub queues: Vec, - /// The MMIO interrupt status. - pub interrupt_status: u32, /// Flag for activated status. pub activated: bool, } @@ -137,7 +136,6 @@ impl VirtioDeviceState { avail_features: device.avail_features(), acked_features: device.acked_features(), queues: device.queues().iter().map(Persist::save).collect(), - interrupt_status: device.interrupt_status().load(Ordering::Relaxed), activated: device.is_activated(), } } @@ -202,6 +200,7 @@ pub struct MmioTransportState { queue_select: u32, device_status: u32, config_generation: u32, + interrupt_status: u32, } /// Auxiliary structure for initializing the transport when resuming from a snapshot. @@ -209,6 +208,8 @@ pub struct MmioTransportState { pub struct MmioTransportConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, + /// Interrupt to use for the device + pub interrupt: Arc, /// Device associated with the current MMIO state. pub device: Arc>, /// Is device backed by vhost-user. 
@@ -227,6 +228,7 @@ impl Persist<'_> for MmioTransport { queue_select: self.queue_select, device_status: self.device_status, config_generation: self.config_generation, + interrupt_status: self.interrupt.irq_status.load(Ordering::SeqCst), } } @@ -236,6 +238,7 @@ impl Persist<'_> for MmioTransport { ) -> Result { let mut transport = MmioTransport::new( constructor_args.mem, + constructor_args.interrupt, constructor_args.device, constructor_args.is_vhost_user, ); @@ -244,6 +247,10 @@ impl Persist<'_> for MmioTransport { transport.queue_select = state.queue_select; transport.device_status = state.device_status; transport.config_generation = state.config_generation; + transport + .interrupt + .irq_status + .store(state.interrupt_status, Ordering::SeqCst); Ok(transport) } } @@ -383,7 +390,7 @@ mod tests { self.queue_select == other.queue_select && self.device_status == other.device_status && self.config_generation == other.config_generation && - self.interrupt_status.load(Ordering::SeqCst) == other.interrupt_status.load(Ordering::SeqCst) && + self.interrupt.irq_status.load(Ordering::SeqCst) == other.interrupt.irq_status.load(Ordering::SeqCst) && // Only checking equality of device type, actual device (de)ser is tested by that // device's tests. self_dev_type == other.device().lock().unwrap().device_type() @@ -392,6 +399,7 @@ mod tests { fn generic_mmiotransport_persistence_test( mmio_transport: MmioTransport, + interrupt: Arc, mem: GuestMemoryMmap, device: Arc>, ) { @@ -401,6 +409,7 @@ mod tests { let restore_args = MmioTransportConstructorArgs { mem, + interrupt, device, is_vhost_user: false, }; @@ -413,8 +422,14 @@ mod tests { assert_eq!(restored_mmio_transport, mmio_transport); } - fn create_default_block() -> (MmioTransport, GuestMemoryMmap, Arc>) { + fn create_default_block() -> ( + MmioTransport, + Arc, + GuestMemoryMmap, + Arc>, + ) { let mem = default_mem(); + let interrupt = Arc::new(IrqTrigger::new()); // Create backing file. 
let f = TempFile::new().unwrap(); @@ -424,25 +439,34 @@ mod tests { FileEngineType::default(), ); let block = Arc::new(Mutex::new(block)); - let mmio_transport = MmioTransport::new(mem.clone(), block.clone(), false); + let mmio_transport = + MmioTransport::new(mem.clone(), interrupt.clone(), block.clone(), false); - (mmio_transport, mem, block) + (mmio_transport, interrupt, mem, block) } - fn create_default_net() -> (MmioTransport, GuestMemoryMmap, Arc>) { + fn create_default_net() -> ( + MmioTransport, + Arc, + GuestMemoryMmap, + Arc>, + ) { let mem = default_mem(); + let interrupt = Arc::new(IrqTrigger::new()); let net = Arc::new(Mutex::new(default_net())); - let mmio_transport = MmioTransport::new(mem.clone(), net.clone(), false); + let mmio_transport = MmioTransport::new(mem.clone(), interrupt.clone(), net.clone(), false); - (mmio_transport, mem, net) + (mmio_transport, interrupt, mem, net) } fn default_vsock() -> ( MmioTransport, + Arc, GuestMemoryMmap, Arc>>, ) { let mem = default_mem(); + let interrupt = Arc::new(IrqTrigger::new()); let guest_cid = 52; let mut temp_uds_path = TempFile::new().unwrap(); @@ -452,26 +476,27 @@ mod tests { let backend = VsockUnixBackend::new(guest_cid, uds_path).unwrap(); let vsock = Vsock::new(guest_cid, backend).unwrap(); let vsock = Arc::new(Mutex::new(vsock)); - let mmio_transport = MmioTransport::new(mem.clone(), vsock.clone(), false); + let mmio_transport = + MmioTransport::new(mem.clone(), interrupt.clone(), vsock.clone(), false); - (mmio_transport, mem, vsock) + (mmio_transport, interrupt, mem, vsock) } #[test] fn test_block_over_mmiotransport_persistence() { - let (mmio_transport, mem, block) = create_default_block(); - generic_mmiotransport_persistence_test(mmio_transport, mem, block); + let (mmio_transport, interrupt, mem, block) = create_default_block(); + generic_mmiotransport_persistence_test(mmio_transport, interrupt, mem, block); } #[test] fn test_net_over_mmiotransport_persistence() { - let (mmio_transport, mem, 
net) = create_default_net(); - generic_mmiotransport_persistence_test(mmio_transport, mem, net); + let (mmio_transport, interrupt, mem, net) = create_default_net(); + generic_mmiotransport_persistence_test(mmio_transport, interrupt, mem, net); } #[test] fn test_vsock_over_mmiotransport_persistence() { - let (mmio_transport, mem, vsock) = default_vsock(); - generic_mmiotransport_persistence_test(mmio_transport, mem, vsock); + let (mmio_transport, interrupt, mem, vsock) = default_vsock(); + generic_mmiotransport_persistence_test(mmio_transport, interrupt, mem, vsock); } } diff --git a/src/vmm/src/devices/virtio/queue.rs b/src/vmm/src/devices/virtio/queue.rs index ec845fe6394..9977070293e 100644 --- a/src/vmm/src/devices/virtio/queue.rs +++ b/src/vmm/src/devices/virtio/queue.rs @@ -20,7 +20,7 @@ pub(super) const FIRECRACKER_MAX_QUEUE_SIZE: u16 = 256; // GuestMemoryMmap::read_obj_from_addr() will be used to fetch the descriptor, // which has an explicit constraint that the entire descriptor doesn't -// cross the page boundary. Otherwise the descriptor may be splitted into +// cross the page boundary. Otherwise the descriptor may be split into // two mmap regions which causes failure of GuestMemoryMmap::read_obj_from_addr(). 
// // The Virtio Spec 1.0 defines the alignment of VirtIO descriptor is 16 bytes, diff --git a/src/vmm/src/devices/virtio/rng/device.rs b/src/vmm/src/devices/virtio/rng/device.rs index d644161d87e..1433a7086e2 100644 --- a/src/vmm/src/devices/virtio/rng/device.rs +++ b/src/vmm/src/devices/virtio/rng/device.rs @@ -3,7 +3,6 @@ use std::io; use std::sync::Arc; -use std::sync::atomic::AtomicU32; use aws_lc_rs::rand; use vm_memory::GuestMemoryError; @@ -12,7 +11,7 @@ use vmm_sys_util::eventfd::EventFd; use super::metrics::METRICS; use super::{RNG_NUM_QUEUES, RNG_QUEUE}; use crate::devices::DeviceError; -use crate::devices::virtio::device::{DeviceState, VirtioDevice}; +use crate::devices::virtio::device::{ActiveState, DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::iov_deque::IovDequeError; use crate::devices::virtio::iovec::IoVecBufferMut; @@ -48,7 +47,6 @@ pub struct Entropy { device_state: DeviceState, pub(crate) queues: Vec, queue_events: Vec, - irq_trigger: IrqTrigger, // Device specific fields rate_limiter: RateLimiter, @@ -70,7 +68,6 @@ impl Entropy { let queue_events = (0..RNG_NUM_QUEUES) .map(|_| EventFd::new(libc::EFD_NONBLOCK)) .collect::, io::Error>>()?; - let irq_trigger = IrqTrigger::new(); Ok(Self { avail_features: 1 << VIRTIO_F_VERSION_1, @@ -79,7 +76,6 @@ impl Entropy { device_state: DeviceState::Inactive, queues, queue_events, - irq_trigger, rate_limiter, buffer: IoVecBufferMut::new()?, }) @@ -90,7 +86,7 @@ impl Entropy { } fn signal_used_queue(&self) -> Result<(), DeviceError> { - self.irq_trigger + self.interrupt_trigger() .trigger_irq(IrqType::Vring) .map_err(DeviceError::FailedSignalingIrq) } @@ -133,7 +129,7 @@ impl Entropy { let mut used_any = false; while let Some(desc) = self.queues[RNG_QUEUE].pop()? { // This is safe since we checked in the event handler that the device is activated. 
- let mem = self.device_state.mem().unwrap(); + let mem = &self.device_state.active_state().unwrap().mem; let index = desc.index; METRICS.entropy_event_count.inc(); @@ -240,12 +236,8 @@ impl Entropy { self.acked_features = features; } - pub(crate) fn set_irq_status(&mut self, status: u32) { - self.irq_trigger.irq_status = Arc::new(AtomicU32::new(status)); - } - - pub(crate) fn set_activated(&mut self, mem: GuestMemoryMmap) { - self.device_state = DeviceState::Activated(mem); + pub(crate) fn set_activated(&mut self, mem: GuestMemoryMmap, interrupt: Arc) { + self.device_state = DeviceState::Activated(ActiveState { mem, interrupt }); } pub(crate) fn activate_event(&self) -> &EventFd { @@ -271,7 +263,11 @@ impl VirtioDevice for Entropy { } fn interrupt_trigger(&self) -> &IrqTrigger { - &self.irq_trigger + &self + .device_state + .active_state() + .expect("Device is not initialized") + .interrupt } fn avail_features(&self) -> u64 { @@ -294,7 +290,11 @@ impl VirtioDevice for Entropy { self.device_state.is_activated() } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc, + ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) .map_err(ActivateError::QueueMemoryError)?; @@ -304,7 +304,7 @@ impl VirtioDevice for Entropy { METRICS.activate_fails.inc(); ActivateError::EventFd })?; - self.device_state = DeviceState::Activated(mem); + self.device_state = DeviceState::Activated(ActiveState { mem, interrupt }); Ok(()) } } diff --git a/src/vmm/src/devices/virtio/rng/persist.rs b/src/vmm/src/devices/virtio/rng/persist.rs index 2f2519b4962..dd2d62debee 100644 --- a/src/vmm/src/devices/virtio/rng/persist.rs +++ b/src/vmm/src/devices/virtio/rng/persist.rs @@ -3,12 +3,15 @@ //! Defines the structures needed for saving/restoring entropy devices. 
+use std::sync::Arc; + use serde::{Deserialize, Serialize}; use crate::devices::virtio::TYPE_RNG; use crate::devices::virtio::persist::{PersistError as VirtioStateError, VirtioDeviceState}; use crate::devices::virtio::queue::FIRECRACKER_MAX_QUEUE_SIZE; use crate::devices::virtio::rng::{Entropy, EntropyError, RNG_NUM_QUEUES}; +use crate::devices::virtio::transport::mmio::IrqTrigger; use crate::rate_limiter::RateLimiter; use crate::rate_limiter::persist::RateLimiterState; use crate::snapshot::Persist; @@ -21,11 +24,14 @@ pub struct EntropyState { } #[derive(Debug)] -pub struct EntropyConstructorArgs(GuestMemoryMmap); +pub struct EntropyConstructorArgs { + mem: GuestMemoryMmap, + interrupt: Arc, +} impl EntropyConstructorArgs { - pub fn new(mem: GuestMemoryMmap) -> Self { - Self(mem) + pub fn new(mem: GuestMemoryMmap, interrupt: Arc) -> Self { + Self { mem, interrupt } } } @@ -56,7 +62,7 @@ impl Persist<'_> for Entropy { state: &Self::State, ) -> Result { let queues = state.virtio_state.build_queues_checked( - &constructor_args.0, + &constructor_args.mem, TYPE_RNG, RNG_NUM_QUEUES, FIRECRACKER_MAX_QUEUE_SIZE, @@ -66,9 +72,8 @@ impl Persist<'_> for Entropy { let mut entropy = Entropy::new_with_queues(queues, rate_limiter)?; entropy.set_avail_features(state.virtio_state.avail_features); entropy.set_acked_features(state.virtio_state.acked_features); - entropy.set_irq_status(state.virtio_state.interrupt_status); if state.virtio_state.activated { - entropy.set_activated(constructor_args.0); + entropy.set_activated(constructor_args.mem, constructor_args.interrupt); } Ok(entropy) @@ -77,11 +82,11 @@ impl Persist<'_> for Entropy { #[cfg(test)] mod tests { - use std::sync::atomic::Ordering; use super::*; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::rng::device::ENTROPY_DEV_ID; + use crate::devices::virtio::test_utils::default_interrupt; use crate::devices::virtio::test_utils::test::create_virtio_mem; use crate::snapshot::Snapshot; @@ -94,19 
+99,16 @@ mod tests { let guest_mem = create_virtio_mem(); let restored = Entropy::restore( - EntropyConstructorArgs(guest_mem), + EntropyConstructorArgs::new(guest_mem, default_interrupt()), &Snapshot::deserialize(&mut mem.as_slice()).unwrap(), ) .unwrap(); assert_eq!(restored.device_type(), TYPE_RNG); assert_eq!(restored.id(), ENTROPY_DEV_ID); - assert_eq!(restored.is_activated(), entropy.is_activated()); + assert!(!restored.is_activated()); + assert!(!entropy.is_activated()); assert_eq!(restored.avail_features(), entropy.avail_features()); assert_eq!(restored.acked_features(), entropy.acked_features()); - assert_eq!( - restored.interrupt_status().load(Ordering::Relaxed), - entropy.interrupt_status().load(Ordering::Relaxed) - ); } } diff --git a/src/vmm/src/devices/virtio/test_utils.rs b/src/vmm/src/devices/virtio/test_utils.rs index 8642d0a85f4..29fbdc5ec56 100644 --- a/src/vmm/src/devices/virtio/test_utils.rs +++ b/src/vmm/src/devices/virtio/test_utils.rs @@ -6,6 +6,7 @@ use std::fmt::Debug; use std::marker::PhantomData; use std::mem; +use std::sync::Arc; use std::sync::atomic::{AtomicUsize, Ordering}; use crate::devices::virtio::queue::Queue; @@ -13,6 +14,8 @@ use crate::test_utils::single_region_mem; use crate::utils::{align_up, u64_to_usize}; use crate::vstate::memory::{Address, Bytes, GuestAddress, GuestMemoryMmap}; +use super::transport::mmio::IrqTrigger; + #[macro_export] macro_rules! check_metric_after_block { ($metric:expr, $delta:expr, $block:expr) => {{ @@ -28,6 +31,11 @@ pub fn default_mem() -> GuestMemoryMmap { single_region_mem(0x10000) } +/// Creates a default [`IrqTrigger`] interrupt for a VirtIO device.
+pub fn default_interrupt() -> Arc { + Arc::new(IrqTrigger::new()) +} + #[derive(Debug)] pub struct InputData { pub data: Vec, @@ -323,7 +331,7 @@ pub(crate) mod test { use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::net::MAX_BUFFER_SIZE; use crate::devices::virtio::queue::{Queue, VIRTQ_DESC_F_NEXT}; - use crate::devices::virtio::test_utils::{VirtQueue, VirtqDesc}; + use crate::devices::virtio::test_utils::{VirtQueue, VirtqDesc, default_interrupt}; use crate::test_utils::single_region_mem; use crate::vstate::memory::{Address, GuestAddress, GuestMemoryMmap}; @@ -414,7 +422,12 @@ pub(crate) mod test { /// Activate the device pub fn activate_device(&mut self, mem: &'a GuestMemoryMmap) { - self.device.lock().unwrap().activate(mem.clone()).unwrap(); + let interrupt = default_interrupt(); + self.device + .lock() + .unwrap() + .activate(mem.clone(), interrupt) + .unwrap(); // Process the activate event let ev_count = self.event_manager.run_with_timeout(100).unwrap(); assert_eq!(ev_count, 1); diff --git a/src/vmm/src/devices/virtio/transport/mmio.rs b/src/vmm/src/devices/virtio/transport/mmio.rs index a763ff811dc..f1a8c8bfabf 100644 --- a/src/vmm/src/devices/virtio/transport/mmio.rs +++ b/src/vmm/src/devices/virtio/transport/mmio.rs @@ -57,7 +57,7 @@ pub struct MmioTransport { pub(crate) device_status: u32, pub(crate) config_generation: u32, mem: GuestMemoryMmap, - pub(crate) interrupt_status: Arc, + pub(crate) interrupt: Arc, pub is_vhost_user: bool, } @@ -65,11 +65,10 @@ impl MmioTransport { /// Constructs a new MMIO transport for the given virtio device. 
pub fn new( mem: GuestMemoryMmap, + interrupt: Arc, device: Arc>, is_vhost_user: bool, ) -> MmioTransport { - let interrupt_status = device.lock().expect("Poisoned lock").interrupt_status(); - MmioTransport { device, features_select: 0, @@ -78,7 +77,7 @@ impl MmioTransport { device_status: device_status::INIT, config_generation: 0, mem, - interrupt_status, + interrupt, is_vhost_user, } } @@ -146,7 +145,7 @@ impl MmioTransport { self.features_select = 0; self.acked_features_select = 0; self.queue_select = 0; - self.interrupt_status.store(0, Ordering::SeqCst); + self.interrupt.irq_status.store(0, Ordering::SeqCst); self.device_status = device_status::INIT; // . Keep interrupt_evt and queue_evts as is. There may be pending notifications in those // eventfds, but nothing will happen other than supurious wakeups. @@ -182,7 +181,9 @@ impl MmioTransport { let device_activated = self.locked_device().is_activated(); if !device_activated { // temporary variable needed for borrow checker - let activate_result = self.locked_device().activate(self.mem.clone()); + let activate_result = self + .locked_device() + .activate(self.mem.clone(), self.interrupt.clone()); if let Err(err) = activate_result { self.device_status |= DEVICE_NEEDS_RESET; @@ -265,7 +266,7 @@ impl MmioTransport { // `VIRTIO_MMIO_INT_CONFIG` or not to understand if we need to send // `VIRTIO_MMIO_INT_CONFIG` or // `VIRTIO_MMIO_INT_VRING`. 
- let is = self.interrupt_status.load(Ordering::SeqCst); + let is = self.interrupt.irq_status.load(Ordering::SeqCst); if !self.is_vhost_user { is } else if is == VIRTIO_MMIO_INT_CONFIG { @@ -326,7 +327,7 @@ impl MmioTransport { 0x44 => self.update_queue_field(|q| q.ready = v == 1), 0x64 => { if self.check_device_status(device_status::DRIVER_OK, 0) { - self.interrupt_status.fetch_and(!v, Ordering::SeqCst); + self.interrupt.irq_status.fetch_and(!v, Ordering::SeqCst); } } 0x70 => self.set_device_status(v), @@ -407,6 +408,7 @@ impl IrqTrigger { #[cfg(test)] pub(crate) mod tests { + use vmm_sys_util::eventfd::EventFd; use super::*; @@ -421,7 +423,7 @@ pub(crate) mod tests { pub(crate) struct DummyDevice { acked_features: u64, avail_features: u64, - interrupt_trigger: IrqTrigger, + interrupt_trigger: Option>, queue_evts: Vec, queues: Vec, device_activated: bool, @@ -434,7 +436,7 @@ pub(crate) mod tests { DummyDevice { acked_features: 0, avail_features: 0, - interrupt_trigger: IrqTrigger::new(), + interrupt_trigger: None, queue_evts: vec![ EventFd::new(libc::EFD_NONBLOCK).unwrap(), EventFd::new(libc::EFD_NONBLOCK).unwrap(), @@ -481,7 +483,9 @@ pub(crate) mod tests { } fn interrupt_trigger(&self) -> &IrqTrigger { - &self.interrupt_trigger + self.interrupt_trigger + .as_ref() + .expect("Device is not activated") } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -494,8 +498,13 @@ pub(crate) mod tests { } } - fn activate(&mut self, _: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + _: GuestMemoryMmap, + interrupt: Arc, + ) -> Result<(), ActivateError> { self.device_activated = true; + self.interrupt_trigger = Some(interrupt); if self.activate_should_error { Err(ActivateError::EventFd) } else { @@ -517,10 +526,11 @@ pub(crate) mod tests { #[test] fn test_new() { let m = single_region_mem(0x1000); + let interrupt = Arc::new(IrqTrigger::new()); let mut dummy = DummyDevice::new(); // Validate reset is no-op. 
assert!(dummy.reset().is_none()); - let mut d = MmioTransport::new(m, Arc::new(Mutex::new(dummy)), false); + let mut d = MmioTransport::new(m, interrupt, Arc::new(Mutex::new(dummy)), false); // We just make sure here that the implementation of a mmio device behaves as we expect, // given a known virtio device implementation (the dummy device). @@ -545,7 +555,13 @@ pub(crate) mod tests { #[test] fn test_bus_device_read() { let m = single_region_mem(0x1000); - let mut d = MmioTransport::new(m, Arc::new(Mutex::new(DummyDevice::new())), false); + let interrupt = Arc::new(IrqTrigger::new()); + let mut d = MmioTransport::new( + m, + interrupt, + Arc::new(Mutex::new(DummyDevice::new())), + false, + ); let mut buf = vec![0xff, 0, 0xfe, 0]; let buf_copy = buf.to_vec(); @@ -592,17 +608,18 @@ pub(crate) mod tests { d.bus_read(0x44, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), u32::from(false)); - d.interrupt_status.store(111, Ordering::SeqCst); + d.interrupt.irq_status.store(111, Ordering::SeqCst); d.bus_read(0x60, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), 111); d.is_vhost_user = true; - d.interrupt_status.store(0, Ordering::SeqCst); + d.interrupt.irq_status.store(0, Ordering::SeqCst); d.bus_read(0x60, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), VIRTIO_MMIO_INT_VRING); d.is_vhost_user = true; - d.interrupt_status + d.interrupt + .irq_status .store(VIRTIO_MMIO_INT_CONFIG, Ordering::SeqCst); d.bus_read(0x60, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), VIRTIO_MMIO_INT_CONFIG); @@ -634,8 +651,9 @@ pub(crate) mod tests { #[allow(clippy::cognitive_complexity)] fn test_bus_device_write() { let m = single_region_mem(0x1000); + let interrupt = Arc::new(IrqTrigger::new()); let dummy_dev = Arc::new(Mutex::new(DummyDevice::new())); - let mut d = MmioTransport::new(m, dummy_dev.clone(), false); + let mut d = MmioTransport::new(m, interrupt, dummy_dev.clone(), false); let mut buf = vec![0; 5]; write_le_u32(&mut buf[..4], 1); @@ -762,10 +780,10 @@ pub(crate) mod tests { 
| device_status::DRIVER_OK, ); - d.interrupt_status.store(0b10_1010, Ordering::Relaxed); + d.interrupt.irq_status.store(0b10_1010, Ordering::Relaxed); write_le_u32(&mut buf[..], 0b111); d.bus_write(0x64, &buf[..]); - assert_eq!(d.interrupt_status.load(Ordering::Relaxed), 0b10_1000); + assert_eq!(d.interrupt.irq_status.load(Ordering::Relaxed), 0b10_1000); // Write to an invalid address in generic register range. write_le_u32(&mut buf[..], 0xf); @@ -796,7 +814,13 @@ pub(crate) mod tests { #[test] fn test_bus_device_activate() { let m = single_region_mem(0x1000); - let mut d = MmioTransport::new(m, Arc::new(Mutex::new(DummyDevice::new())), false); + let interrupt = Arc::new(IrqTrigger::new()); + let mut d = MmioTransport::new( + m, + interrupt, + Arc::new(Mutex::new(DummyDevice::new())), + false, + ); assert!(!d.locked_device().is_activated()); assert_eq!(d.device_status, device_status::INIT); @@ -873,11 +897,12 @@ pub(crate) mod tests { #[test] fn test_bus_device_activate_failure() { let m = single_region_mem(0x1000); + let interrupt = Arc::new(IrqTrigger::new()); let device = DummyDevice { activate_should_error: true, ..DummyDevice::new() }; - let mut d = MmioTransport::new(m, Arc::new(Mutex::new(device)), false); + let mut d = MmioTransport::new(m, interrupt, Arc::new(Mutex::new(device)), false); set_device_status(&mut d, device_status::ACKNOWLEDGE); set_device_status(&mut d, device_status::ACKNOWLEDGE | device_status::DRIVER); @@ -895,10 +920,7 @@ pub(crate) mod tests { write_le_u32(&mut buf[..], 1); d.bus_write(0x44, &buf[..]); } - assert_eq!( - d.locked_device().interrupt_status().load(Ordering::SeqCst), - 0 - ); + assert!(!d.locked_device().is_activated()); set_device_status( &mut d, @@ -967,7 +989,13 @@ pub(crate) mod tests { #[test] fn test_bus_device_reset() { let m = single_region_mem(0x1000); - let mut d = MmioTransport::new(m, Arc::new(Mutex::new(DummyDevice::new())), false); + let interrupt = Arc::new(IrqTrigger::new()); + let mut d = MmioTransport::new( 
+ m, + interrupt, + Arc::new(Mutex::new(DummyDevice::new())), + false, + ); let mut buf = [0; 4]; assert!(!d.locked_device().is_activated()); diff --git a/src/vmm/src/devices/virtio/vsock/device.rs b/src/vmm/src/devices/virtio/vsock/device.rs index fc51a61532c..ad049b517e4 100644 --- a/src/vmm/src/devices/virtio/vsock/device.rs +++ b/src/vmm/src/devices/virtio/vsock/device.rs @@ -21,6 +21,7 @@ //! - a backend FD. use std::fmt::Debug; +use std::sync::Arc; use log::{error, warn}; use vmm_sys_util::eventfd::EventFd; @@ -30,7 +31,7 @@ use super::defs::uapi; use super::packet::{VSOCK_PKT_HDR_SIZE, VsockPacketRx, VsockPacketTx}; use super::{VsockBackend, defs}; use crate::devices::virtio::ActivateError; -use crate::devices::virtio::device::{DeviceState, VirtioDevice}; +use crate::devices::virtio::device::{ActiveState, DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_config::{VIRTIO_F_IN_ORDER, VIRTIO_F_VERSION_1}; use crate::devices::virtio::queue::{InvalidAvailIdx, Queue as VirtQueue}; use crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; @@ -62,7 +63,6 @@ pub struct Vsock { pub(crate) backend: B, pub(crate) avail_features: u64, pub(crate) acked_features: u64, - pub(crate) irq_trigger: IrqTrigger, // This EventFd is the only one initially registered for a vsock device, and is used to convert // a VirtioDevice::activate call into an EventHandler read event which allows the other events // (queue and backend related) to be registered post virtio device activation. That's @@ -103,7 +103,6 @@ where backend, avail_features: AVAIL_FEATURES, acked_features: 0, - irq_trigger: IrqTrigger::new(), activate_evt: EventFd::new(libc::EFD_NONBLOCK).map_err(VsockError::EventFd)?, device_state: DeviceState::Inactive, rx_packet: VsockPacketRx::new()?, @@ -138,7 +137,10 @@ where /// Signal the guest driver that we've used some virtio buffers that it had previously made /// available. 
pub fn signal_used_queue(&self) -> Result<(), DeviceError> { - self.irq_trigger + self.device_state + .active_state() + .expect("Device is not initialized") + .interrupt .trigger_irq(IrqType::Vring) .map_err(DeviceError::FailedSignalingIrq) } @@ -148,7 +150,7 @@ where /// otherwise. pub fn process_rx(&mut self) -> Result { // This is safe since we checked in the event handler that the device is activated. - let mem = self.device_state.mem().unwrap(); + let mem = &self.device_state.active_state().unwrap().mem; let queue = &mut self.queues[RXQ_INDEX]; let mut have_used = false; @@ -201,7 +203,7 @@ where /// ring, and `false` otherwise. pub fn process_tx(&mut self) -> Result { // This is safe since we checked in the event handler that the device is activated. - let mem = self.device_state.mem().unwrap(); + let mem = &self.device_state.active_state().unwrap().mem; let queue = &mut self.queues[TXQ_INDEX]; let mut have_used = false; @@ -241,7 +243,7 @@ where // remain but their CID is updated to reflect the current guest_cid. pub fn send_transport_reset_event(&mut self) -> Result<(), DeviceError> { // This is safe since we checked in the caller function that the device is activated. 
- let mem = self.device_state.mem().unwrap(); + let mem = &self.device_state.active_state().unwrap().mem; let queue = &mut self.queues[EVQ_INDEX]; let head = queue.pop()?.ok_or_else(|| { @@ -296,7 +298,11 @@ where } fn interrupt_trigger(&self) -> &IrqTrigger { - &self.irq_trigger + &self + .device_state + .active_state() + .expect("Device is not initialized") + .interrupt } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -328,7 +334,11 @@ where ); } - fn activate(&mut self, mem: GuestMemoryMmap) -> Result<(), ActivateError> { + fn activate( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc, + ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) .map_err(ActivateError::QueueMemoryError)?; @@ -347,7 +357,7 @@ where return Err(ActivateError::EventFd); } - self.device_state = DeviceState::Activated(mem); + self.device_state = DeviceState::Activated(ActiveState { mem, interrupt }); Ok(()) } @@ -430,6 +440,8 @@ mod tests { // } // Test a correct activation. 
- ctx.device.activate(ctx.mem.clone()).unwrap(); + ctx.device + .activate(ctx.mem.clone(), ctx.interrupt.clone()) + .unwrap(); } } diff --git a/src/vmm/src/devices/virtio/vsock/event_handler.rs b/src/vmm/src/devices/virtio/vsock/event_handler.rs index 59fbd3eaa3d..9c909048a69 100755 --- a/src/vmm/src/devices/virtio/vsock/event_handler.rs +++ b/src/vmm/src/devices/virtio/vsock/event_handler.rs @@ -240,7 +240,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); ctx.device.backend.set_pending_rx(false); ctx.signal_txq_event(); @@ -257,7 +257,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); ctx.device.backend.set_pending_rx(true); ctx.signal_txq_event(); @@ -273,7 +273,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); ctx.device.backend.set_pending_rx(false); ctx.device.backend.set_tx_err(Some(VsockError::NoData)); @@ -289,7 +289,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); // Invalidate the descriptor chain, by setting its length to 0. 
ctx.guest_txvq.dtable[0].len.set(0); @@ -306,7 +306,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); assert!(!ctx.device.handle_txq_event(EventSet::IN)); } @@ -321,7 +321,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); ctx.device.backend.set_pending_rx(true); ctx.device.backend.set_rx_err(Some(VsockError::NoData)); @@ -338,7 +338,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); ctx.device.backend.set_pending_rx(true); ctx.signal_rxq_event(); @@ -351,7 +351,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); // Invalidate the descriptor chain, by setting its length to 0. 
ctx.guest_rxvq.dtable[0].len.set(0); @@ -367,7 +367,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); ctx.device.backend.set_pending_rx(false); assert!(!ctx.device.handle_rxq_event(EventSet::IN)); } @@ -392,7 +392,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); ctx.device.backend.set_pending_rx(true); ctx.device.notify_backend(EventSet::IN).unwrap(); @@ -411,7 +411,7 @@ mod tests { { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); - ctx.mock_activate(test_ctx.mem.clone()); + ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); ctx.device.backend.set_pending_rx(false); ctx.device.notify_backend(EventSet::IN).unwrap(); @@ -454,7 +454,7 @@ mod tests { { let mut ctx = test_ctx.create_event_handler_context(); - // When modifiyng the buffer descriptor, make sure the len field is altered in the + // When modifying the buffer descriptor, make sure the len field is altered in the // vsock packet header descriptor as well. if desc_idx == 1 { // The vsock packet len field has offset 24 in the header. @@ -582,7 +582,7 @@ mod tests { vsock .lock() .unwrap() - .activate(test_ctx.mem.clone()) + .activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()) .unwrap(); // Process the activate event. 
let ev_count = event_manager.run_with_timeout(50).unwrap(); diff --git a/src/vmm/src/devices/virtio/vsock/persist.rs b/src/vmm/src/devices/virtio/vsock/persist.rs index fce6affae69..3d0967926be 100644 --- a/src/vmm/src/devices/virtio/vsock/persist.rs +++ b/src/vmm/src/devices/virtio/vsock/persist.rs @@ -5,14 +5,14 @@ use std::fmt::Debug; use std::sync::Arc; -use std::sync::atomic::AtomicU32; use serde::{Deserialize, Serialize}; use super::*; -use crate::devices::virtio::device::DeviceState; +use crate::devices::virtio::device::{ActiveState, DeviceState}; use crate::devices::virtio::persist::VirtioDeviceState; use crate::devices::virtio::queue::FIRECRACKER_MAX_QUEUE_SIZE; +use crate::devices::virtio::transport::mmio::IrqTrigger; use crate::devices::virtio::vsock::TYPE_VSOCK; use crate::snapshot::Persist; use crate::vstate::memory::GuestMemoryMmap; @@ -29,7 +29,7 @@ pub struct VsockState { /// The Vsock frontend serializable state. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct VsockFrontendState { - /// Context IDentifier. + /// Context Identifier. pub cid: u64, virtio_state: VirtioDeviceState, } @@ -53,6 +53,8 @@ pub struct VsockUdsState { pub struct VsockConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, + /// Interrupt to use for the device. + pub interrupt: Arc, /// The vsock Unix Backend. 
pub backend: B, } @@ -121,10 +123,11 @@ where vsock.acked_features = state.virtio_state.acked_features; vsock.avail_features = state.virtio_state.avail_features; - vsock.irq_trigger.irq_status = - Arc::new(AtomicU32::new(state.virtio_state.interrupt_status)); vsock.device_state = if state.virtio_state.activated { - DeviceState::Activated(constructor_args.mem) + DeviceState::Activated(ActiveState { + mem: constructor_args.mem, + interrupt: constructor_args.interrupt, + }) } else { DeviceState::Inactive }; @@ -137,6 +140,7 @@ pub(crate) mod tests { use super::device::AVAIL_FEATURES; use super::*; use crate::devices::virtio::device::VirtioDevice; + use crate::devices::virtio::test_utils::default_interrupt; use crate::devices::virtio::vsock::defs::uapi; use crate::devices::virtio::vsock::test_utils::{TestBackend, TestContext}; use crate::snapshot::Snapshot; @@ -189,6 +193,7 @@ pub(crate) mod tests { let mut restored_device = Vsock::restore( VsockConstructorArgs { mem: ctx.mem.clone(), + interrupt: default_interrupt(), backend: match restored_state.backend { VsockBackendState::Uds(uds_state) => { assert_eq!(uds_state.path, "test".to_owned()); diff --git a/src/vmm/src/devices/virtio/vsock/test_utils.rs b/src/vmm/src/devices/virtio/vsock/test_utils.rs index 921c2e79bdb..56795e5fd36 100644 --- a/src/vmm/src/devices/virtio/vsock/test_utils.rs +++ b/src/vmm/src/devices/virtio/vsock/test_utils.rs @@ -5,6 +5,7 @@ #![doc(hidden)] use std::os::unix::io::{AsRawFd, RawFd}; +use std::sync::Arc; use vmm_sys_util::epoll::EventSet; use vmm_sys_util::eventfd::EventFd; @@ -12,7 +13,8 @@ use vmm_sys_util::eventfd::EventFd; use super::packet::{VsockPacketRx, VsockPacketTx}; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::{VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; -use crate::devices::virtio::test_utils::VirtQueue as GuestQ; +use crate::devices::virtio::test_utils::{VirtQueue as GuestQ, default_interrupt}; +use 
crate::devices::virtio::transport::mmio::IrqTrigger; use crate::devices::virtio::vsock::device::{RXQ_INDEX, TXQ_INDEX}; use crate::devices::virtio::vsock::packet::VSOCK_PKT_HDR_SIZE; use crate::devices::virtio::vsock::{ @@ -117,6 +119,7 @@ impl VsockBackend for TestBackend {} pub struct TestContext { pub cid: u64, pub mem: GuestMemoryMmap, + pub interrupt: Arc, pub mem_size: usize, pub device: Vsock, } @@ -134,6 +137,7 @@ impl TestContext { Self { cid: CID, mem, + interrupt: default_interrupt(), mem_size: MEM_SIZE, device, } @@ -196,9 +200,9 @@ pub struct EventHandlerContext<'a> { } impl EventHandlerContext<'_> { - pub fn mock_activate(&mut self, mem: GuestMemoryMmap) { + pub fn mock_activate(&mut self, mem: GuestMemoryMmap, interrupt: Arc) { // Artificially activate the device. - self.device.activate(mem).unwrap(); + self.device.activate(mem, interrupt).unwrap(); } pub fn signal_txq_event(&mut self) { From 46937af68a430737101930bb627c212dd942bf49 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 16 Apr 2025 15:10:31 +0200 Subject: [PATCH 04/56] virtio: add generic interrupt trait Describe the API that types used as interrupts for VirtIO devices need to implement. Currently, we only use `IrqTrigger` interrupts, but this will change once we have MSI-X with PCIe devices.
Signed-off-by: Babis Chalios --- src/vmm/src/devices/virtio/transport/mmio.rs | 83 +++++++++++++------- src/vmm/src/devices/virtio/transport/mod.rs | 32 ++++++++ 2 files changed, 88 insertions(+), 27 deletions(-) diff --git a/src/vmm/src/devices/virtio/transport/mmio.rs b/src/vmm/src/devices/virtio/transport/mmio.rs index f1a8c8bfabf..9f72e3fa75a 100644 --- a/src/vmm/src/devices/virtio/transport/mmio.rs +++ b/src/vmm/src/devices/virtio/transport/mmio.rs @@ -11,6 +11,7 @@ use std::sync::{Arc, Mutex, MutexGuard}; use vmm_sys_util::eventfd::EventFd; +use super::{VirtioInterrupt, VirtioInterruptType}; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::device_status; use crate::devices::virtio::queue::Queue; @@ -368,6 +369,15 @@ pub enum IrqType { Vring, } +impl From for IrqType { + fn from(interrupt_type: VirtioInterruptType) -> Self { + match interrupt_type { + VirtioInterruptType::Config => IrqType::Config, + VirtioInterruptType::Queue(_) => IrqType::Vring, + } + } +} + /// Helper struct that is responsible for triggering guest IRQs #[derive(Debug)] pub struct IrqTrigger { @@ -381,6 +391,40 @@ impl Default for IrqTrigger { } } +impl VirtioInterrupt for IrqTrigger { + fn trigger(&self, interrupt_type: VirtioInterruptType) -> Result<(), std::io::Error> { + match interrupt_type { + VirtioInterruptType::Config => self.trigger_irq(IrqType::Config), + VirtioInterruptType::Queue(_) => self.trigger_irq(IrqType::Vring), + } + } + + fn notifier(&self, _interrupt_type: VirtioInterruptType) -> Option<&EventFd> { + Some(&self.irq_evt) + } + + fn status(&self) -> Arc { + self.irq_status.clone() + } + + #[cfg(test)] + fn has_pending_interrupt(&self, interrupt_type: VirtioInterruptType) -> bool { + if let Ok(num_irqs) = self.irq_evt.read() { + if num_irqs == 0 { + return false; + } + + let irq_status = self.irq_status.load(Ordering::SeqCst); + return matches!( + (irq_status, interrupt_type.into()), + (VIRTIO_MMIO_INT_CONFIG, IrqType::Config) | 
(VIRTIO_MMIO_INT_VRING, IrqType::Vring) + ); + } + + false + } +} + impl IrqTrigger { pub fn new() -> Self { Self { @@ -1045,44 +1089,29 @@ pub(crate) mod tests { assert_eq!(dummy_dev.acked_features(), 24); } - impl IrqTrigger { - pub fn has_pending_irq(&self, irq_type: IrqType) -> bool { - if let Ok(num_irqs) = self.irq_evt.read() { - if num_irqs == 0 { - return false; - } - - let irq_status = self.irq_status.load(Ordering::SeqCst); - return matches!( - (irq_status, irq_type), - (VIRTIO_MMIO_INT_CONFIG, IrqType::Config) - | (VIRTIO_MMIO_INT_VRING, IrqType::Vring) - ); - } - - false - } - } - #[test] fn irq_trigger() { let irq_trigger = IrqTrigger::new(); assert_eq!(irq_trigger.irq_status.load(Ordering::SeqCst), 0); // Check that there are no pending irqs. - assert!(!irq_trigger.has_pending_irq(IrqType::Config)); - assert!(!irq_trigger.has_pending_irq(IrqType::Vring)); + assert!(!irq_trigger.has_pending_interrupt(VirtioInterruptType::Config)); + assert!(!irq_trigger.has_pending_interrupt(VirtioInterruptType::Queue(0))); // Check that trigger_irq() correctly generates irqs. - irq_trigger.trigger_irq(IrqType::Config).unwrap(); - assert!(irq_trigger.has_pending_irq(IrqType::Config)); + irq_trigger.trigger(VirtioInterruptType::Config).unwrap(); + assert!(irq_trigger.has_pending_interrupt(VirtioInterruptType::Config)); irq_trigger.irq_status.store(0, Ordering::SeqCst); - irq_trigger.trigger_irq(IrqType::Vring).unwrap(); - assert!(irq_trigger.has_pending_irq(IrqType::Vring)); + irq_trigger.trigger(VirtioInterruptType::Queue(0)).unwrap(); + assert!(irq_trigger.has_pending_interrupt(VirtioInterruptType::Queue(0))); // Check trigger_irq() failure case (irq_evt is full). 
irq_trigger.irq_evt.write(u64::MAX - 1).unwrap(); - irq_trigger.trigger_irq(IrqType::Config).unwrap_err(); - irq_trigger.trigger_irq(IrqType::Vring).unwrap_err(); + irq_trigger + .trigger(VirtioInterruptType::Config) + .unwrap_err(); + irq_trigger + .trigger(VirtioInterruptType::Queue(0)) + .unwrap_err(); } } diff --git a/src/vmm/src/devices/virtio/transport/mod.rs b/src/vmm/src/devices/virtio/transport/mod.rs index 1ff8229a1c8..d41ad943aa2 100644 --- a/src/vmm/src/devices/virtio/transport/mod.rs +++ b/src/vmm/src/devices/virtio/transport/mod.rs @@ -1,5 +1,37 @@ // Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 +use std::sync::Arc; +use std::sync::atomic::AtomicU32; + +use vmm_sys_util::eventfd::EventFd; + /// MMIO transport for VirtIO devices pub mod mmio; + +/// Represents the types of interrupts used by VirtIO devices +#[derive(Debug, Clone)] +pub enum VirtioInterruptType { + /// Interrupt for VirtIO configuration changes + Config, + /// Interrupts for new events in a queue. + Queue(u16), +} + +/// API of interrupt types used by VirtIO devices +pub trait VirtioInterrupt: std::fmt::Debug + Send + Sync { + /// Trigger a VirtIO interrupt. + fn trigger(&self, interrupt_type: VirtioInterruptType) -> Result<(), std::io::Error>; + + /// Get the `EventFd` (if any) that backs the underlying interrupt. + fn notifier(&self, _interrupt_type: VirtioInterruptType) -> Option<&EventFd> { + None + } + + /// Get the current device interrupt status. 
+ fn status(&self) -> Arc; + + /// Returns true if there is any pending interrupt + #[cfg(test)] + fn has_pending_interrupt(&self, interrupt_type: VirtioInterruptType) -> bool; +} From 2a6ca422f3618ea57e960eeaf467573aa83e4815 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 22 Apr 2025 10:01:51 +0200 Subject: [PATCH 05/56] refactor: use VirtioInterrupt in VirtIO devices VirtIO devices assume they're operating under an MMIO transport and as a consequence they use IrqTrigger as interrupts. Switch that to using VirtioInterrupt for all VirtIO device objects. Only assume a VirtioInterrupt is an IrqTrigger in MMIO specific code. Signed-off-by: Babis Chalios --- src/vmm/src/device_manager/mmio.rs | 12 +- src/vmm/src/devices/virtio/balloon/device.rs | 30 ++-- src/vmm/src/devices/virtio/balloon/persist.rs | 4 +- .../src/devices/virtio/balloon/test_utils.rs | 10 +- src/vmm/src/devices/virtio/block/device.rs | 6 +- src/vmm/src/devices/virtio/block/persist.rs | 4 +- .../devices/virtio/block/vhost_user/device.rs | 20 +-- .../devices/virtio/block/vhost_user/mod.rs | 2 +- .../src/devices/virtio/block/virtio/device.rs | 25 ++-- .../src/devices/virtio/block/virtio/mod.rs | 4 +- .../devices/virtio/block/virtio/test_utils.rs | 9 +- src/vmm/src/devices/virtio/device.rs | 14 +- src/vmm/src/devices/virtio/net/device.rs | 136 ++++++++++++++---- src/vmm/src/devices/virtio/net/persist.rs | 4 +- src/vmm/src/devices/virtio/net/test_utils.rs | 6 +- src/vmm/src/devices/virtio/rng/device.rs | 19 ++- src/vmm/src/devices/virtio/rng/persist.rs | 6 +- src/vmm/src/devices/virtio/test_utils.rs | 6 +- src/vmm/src/devices/virtio/transport/mmio.rs | 16 ++- src/vmm/src/devices/virtio/vhost_user.rs | 28 +++- src/vmm/src/devices/virtio/vsock/device.rs | 19 +-- .../src/devices/virtio/vsock/event_handler.rs | 5 +- src/vmm/src/devices/virtio/vsock/persist.rs | 4 +- .../src/devices/virtio/vsock/test_utils.rs | 6 +- 24 files changed, 258 insertions(+), 137 deletions(-) diff --git 
a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 992d3b70d61..2c7bdb0d679 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -505,7 +505,7 @@ impl MMIODeviceManager { .unwrap(); if vsock.is_activated() { info!("kick vsock {id}."); - vsock.signal_used_queue().unwrap(); + vsock.signal_used_queue(0).unwrap(); } } TYPE_RNG => { @@ -525,6 +525,7 @@ impl MMIODeviceManager { #[cfg(test)] mod tests { + use std::ops::Deref; use std::sync::Arc; use vmm_sys_util::eventfd::EventFd; @@ -534,6 +535,7 @@ mod tests { use crate::devices::virtio::ActivateError; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::Queue; + use crate::devices::virtio::transport::VirtioInterrupt; use crate::devices::virtio::transport::mmio::IrqTrigger; use crate::test_utils::multi_region_mem_raw; use crate::vstate::kvm::Kvm; @@ -620,10 +622,8 @@ mod tests { &self.queue_evts } - fn interrupt_trigger(&self) -> &IrqTrigger { - self.interrupt_trigger - .as_ref() - .expect("Device is not activated") + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { + self.interrupt_trigger.as_ref().unwrap().deref() } fn ack_features_by_page(&mut self, page: u32, value: u32) { @@ -644,7 +644,7 @@ mod tests { fn activate( &mut self, _: GuestMemoryMmap, - _: Arc, + _: Arc, ) -> Result<(), ActivateError> { Ok(()) } diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs index e07d287c029..8b06e8ea38f 100644 --- a/src/vmm/src/devices/virtio/balloon/device.rs +++ b/src/vmm/src/devices/virtio/balloon/device.rs @@ -1,6 +1,7 @@ // Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 +use std::ops::Deref; use std::sync::Arc; use std::time::Duration; @@ -27,7 +28,7 @@ use crate::devices::virtio::balloon::BalloonError; use crate::devices::virtio::device::ActiveState; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::queue::InvalidAvailIdx; -use crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; +use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::logger::IncMetric; use crate::utils::u64_to_usize; use crate::vstate::memory::{Address, ByteValued, Bytes, GuestAddress, GuestMemoryMmap}; @@ -342,7 +343,7 @@ impl Balloon { queue.advance_used_ring_idx(); if needs_interrupt { - self.signal_used_queue()?; + self.signal_used_queue(INFLATE_INDEX)?; } Ok(()) @@ -361,7 +362,7 @@ impl Balloon { queue.advance_used_ring_idx(); if needs_interrupt { - self.signal_used_queue() + self.signal_used_queue(DEFLATE_INDEX) } else { Ok(()) } @@ -403,9 +404,12 @@ impl Balloon { Ok(()) } - pub(crate) fn signal_used_queue(&self) -> Result<(), BalloonError> { + pub(crate) fn signal_used_queue(&self, qidx: usize) -> Result<(), BalloonError> { self.interrupt_trigger() - .trigger_irq(IrqType::Vring) + .trigger(VirtioInterruptType::Queue( + qidx.try_into() + .unwrap_or_else(|_| panic!("balloon: invalid queue id: {qidx}")), + )) .map_err(|err| { METRICS.event_fails.inc(); BalloonError::InterruptError(err) @@ -435,7 +439,7 @@ impl Balloon { if let Some(index) = self.stats_desc_index.take() { self.queues[STATS_INDEX].add_used(index, 0)?; self.queues[STATS_INDEX].advance_used_ring_idx(); - self.signal_used_queue() + self.signal_used_queue(STATS_INDEX) } else { error!("Failed to update balloon stats, missing descriptor."); Ok(()) @@ -447,7 +451,7 @@ impl Balloon { if self.is_activated() { self.config_space.num_pages = mib_to_pages(amount_mib)?; self.interrupt_trigger() - .trigger_irq(IrqType::Config) + .trigger(VirtioInterruptType::Config) 
.map_err(BalloonError::InterruptError) } else { Err(BalloonError::DeviceNotActive) @@ -558,12 +562,12 @@ impl VirtioDevice for Balloon { &self.queue_evts } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self - .device_state + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { + self.device_state .active_state() .expect("Device is not activated") .interrupt + .deref() } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -593,7 +597,7 @@ impl VirtioDevice for Balloon { fn activate( &mut self, mem: GuestMemoryMmap, - interrupt: Arc, + interrupt: Arc, ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) @@ -1070,7 +1074,9 @@ pub(crate) mod tests { assert!(balloon.stats_desc_index.is_some()); balloon.process_stats_timer_event().unwrap(); assert!(balloon.stats_desc_index.is_none()); - assert!(balloon.interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!(balloon.interrupt_trigger().has_pending_interrupt( + VirtioInterruptType::Queue(STATS_INDEX.try_into().unwrap()) + )); }); } } diff --git a/src/vmm/src/devices/virtio/balloon/persist.rs b/src/vmm/src/devices/virtio/balloon/persist.rs index 397dd8aeb3e..a6634d07170 100644 --- a/src/vmm/src/devices/virtio/balloon/persist.rs +++ b/src/vmm/src/devices/virtio/balloon/persist.rs @@ -15,7 +15,7 @@ use crate::devices::virtio::balloon::device::{BalloonStats, ConfigSpace}; use crate::devices::virtio::device::{ActiveState, DeviceState}; use crate::devices::virtio::persist::VirtioDeviceState; use crate::devices::virtio::queue::FIRECRACKER_MAX_QUEUE_SIZE; -use crate::devices::virtio::transport::mmio::IrqTrigger; +use crate::devices::virtio::transport::VirtioInterrupt; use crate::snapshot::Persist; use crate::vstate::memory::GuestMemoryMmap; @@ -96,7 +96,7 @@ pub struct BalloonConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, /// Interrupt used from the device. 
- pub interrupt: Arc, + pub interrupt: Arc, pub restored_from_file: bool, } diff --git a/src/vmm/src/devices/virtio/balloon/test_utils.rs b/src/vmm/src/devices/virtio/balloon/test_utils.rs index e588abaedee..2665d5dbd87 100644 --- a/src/vmm/src/devices/virtio/balloon/test_utils.rs +++ b/src/vmm/src/devices/virtio/balloon/test_utils.rs @@ -3,6 +3,8 @@ #![doc(hidden)] +#[cfg(test)] +use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::test_utils::VirtQueue; #[cfg(test)] use crate::devices::virtio::{balloon::BALLOON_NUM_QUEUES, balloon::Balloon}; @@ -10,8 +12,7 @@ use crate::devices::virtio::{balloon::BALLOON_NUM_QUEUES, balloon::Balloon}; #[cfg(test)] pub fn invoke_handler_for_queue_event(b: &mut Balloon, queue_index: usize) { use crate::devices::virtio::balloon::{DEFLATE_INDEX, INFLATE_INDEX, STATS_INDEX}; - use crate::devices::virtio::device::VirtioDevice; - use crate::devices::virtio::transport::mmio::IrqType; + use crate::devices::virtio::transport::VirtioInterruptType; assert!(queue_index < BALLOON_NUM_QUEUES); // Trigger the queue event. @@ -25,7 +26,10 @@ pub fn invoke_handler_for_queue_event(b: &mut Balloon, queue_index: usize) { }; // Validate the queue operation finished successfully. 
let interrupt = b.interrupt_trigger(); - assert!(interrupt.has_pending_irq(IrqType::Vring)); + assert!( + interrupt + .has_pending_interrupt(VirtioInterruptType::Queue(queue_index.try_into().unwrap())) + ); } pub fn set_request(queue: &VirtQueue, idx: u16, addr: u64, len: u32, flags: u16) { diff --git a/src/vmm/src/devices/virtio/block/device.rs b/src/vmm/src/devices/virtio/block/device.rs index 5a491c537c5..d58550acc59 100644 --- a/src/vmm/src/devices/virtio/block/device.rs +++ b/src/vmm/src/devices/virtio/block/device.rs @@ -12,7 +12,7 @@ use super::vhost_user::device::{VhostUserBlock, VhostUserBlockConfig}; use super::virtio::device::{VirtioBlock, VirtioBlockConfig}; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::{InvalidAvailIdx, Queue}; -use crate::devices::virtio::transport::mmio::IrqTrigger; +use crate::devices::virtio::transport::VirtioInterrupt; use crate::devices::virtio::{ActivateError, TYPE_BLOCK}; use crate::rate_limiter::BucketUpdate; use crate::snapshot::Persist; @@ -176,7 +176,7 @@ impl VirtioDevice for Block { } } - fn interrupt_trigger(&self) -> &IrqTrigger { + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { match self { Self::Virtio(b) => b.interrupt_trigger(), Self::VhostUser(b) => b.interrupt_trigger(), @@ -200,7 +200,7 @@ impl VirtioDevice for Block { fn activate( &mut self, mem: GuestMemoryMmap, - interrupt: Arc, + interrupt: Arc, ) -> Result<(), ActivateError> { match self { Self::Virtio(b) => b.activate(mem, interrupt), diff --git a/src/vmm/src/devices/virtio/block/persist.rs b/src/vmm/src/devices/virtio/block/persist.rs index e7ae1768cca..57712a8fb3a 100644 --- a/src/vmm/src/devices/virtio/block/persist.rs +++ b/src/vmm/src/devices/virtio/block/persist.rs @@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize}; use super::vhost_user::persist::VhostUserBlockState; use super::virtio::persist::VirtioBlockState; -use crate::devices::virtio::transport::mmio::IrqTrigger; +use 
crate::devices::virtio::transport::VirtioInterrupt; use crate::vstate::memory::GuestMemoryMmap; /// Block device state. @@ -21,5 +21,5 @@ pub enum BlockState { #[derive(Debug)] pub struct BlockConstructorArgs { pub mem: GuestMemoryMmap, - pub interrupt: Arc, + pub interrupt: Arc, } diff --git a/src/vmm/src/devices/virtio/block/vhost_user/device.rs b/src/vmm/src/devices/virtio/block/vhost_user/device.rs index 22429996d5f..1d6c2aac080 100644 --- a/src/vmm/src/devices/virtio/block/vhost_user/device.rs +++ b/src/vmm/src/devices/virtio/block/vhost_user/device.rs @@ -4,6 +4,7 @@ // Portions Copyright 2019 Intel Corporation. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 +use std::ops::Deref; use std::sync::Arc; use log::error; @@ -14,13 +15,12 @@ use vmm_sys_util::eventfd::EventFd; use super::{NUM_QUEUES, QUEUE_SIZE, VhostUserBlockError}; use crate::devices::virtio::block::CacheType; -use crate::devices::virtio::device::ActiveState; -use crate::devices::virtio::device::{DeviceState, VirtioDevice}; +use crate::devices::virtio::device::{ActiveState, DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_blk::{VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO}; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::generated::virtio_ring::VIRTIO_RING_F_EVENT_IDX; use crate::devices::virtio::queue::Queue; -use crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; +use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::devices::virtio::vhost_user::{VhostUserHandleBackend, VhostUserHandleImpl}; use crate::devices::virtio::vhost_user_metrics::{ VhostUserDeviceMetrics, VhostUserMetricsPerDevice, @@ -275,8 +275,8 @@ impl VhostUserBlockImpl { .map_err(VhostUserBlockError::Vhost)?; self.config_space = new_config_space; interrupt - .trigger_irq(IrqType::Config) - .map_err(VhostUserBlockError::IrqTrigger)?; + .trigger(VirtioInterruptType::Config) + 
.map_err(VhostUserBlockError::Interrupt)?; let delta_us = get_time_us(ClockType::Monotonic) - start_time; self.metrics.config_change_time_us.store(delta_us); @@ -314,12 +314,12 @@ impl VirtioDevice for VhostUserBlock &self.queue_evts } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self - .device_state + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { + self.device_state .active_state() .expect("Device is not initialized") .interrupt + .deref() } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -341,7 +341,7 @@ impl VirtioDevice for VhostUserBlock fn activate( &mut self, mem: GuestMemoryMmap, - interrupt: Arc, + interrupt: Arc, ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) @@ -357,7 +357,7 @@ impl VirtioDevice for VhostUserBlock self.vu_handle.setup_backend( &mem, &[(0, &self.queues[0], &self.queue_evts[0])], - &interrupt, + interrupt.clone(), ) }) .map_err(|err| { diff --git a/src/vmm/src/devices/virtio/block/vhost_user/mod.rs b/src/vmm/src/devices/virtio/block/vhost_user/mod.rs index 8d4d9f44261..0afaaed3400 100644 --- a/src/vmm/src/devices/virtio/block/vhost_user/mod.rs +++ b/src/vmm/src/devices/virtio/block/vhost_user/mod.rs @@ -28,5 +28,5 @@ pub enum VhostUserBlockError { /// Error opening eventfd: {0} EventFd(std::io::Error), /// Error creating irqfd: {0} - IrqTrigger(std::io::Error), + Interrupt(std::io::Error), } diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index 413410f2af6..d04fd5674ea 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -9,6 +9,7 @@ use std::cmp; use std::convert::From; use std::fs::{File, OpenOptions}; use std::io::{Seek, SeekFrom}; +use std::ops::Deref; use std::os::linux::fs::MetadataExt; use std::path::PathBuf; use std::sync::Arc; @@ -30,7 +31,7 @@ use crate::devices::virtio::generated::virtio_blk::{ use 
crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::generated::virtio_ring::VIRTIO_RING_F_EVENT_IDX; use crate::devices::virtio::queue::{InvalidAvailIdx, Queue}; -use crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; +use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::devices::virtio::{ActivateError, TYPE_BLOCK}; use crate::logger::{IncMetric, error, warn}; use crate::rate_limiter::{BucketUpdate, RateLimiter}; @@ -446,7 +447,7 @@ impl VirtioBlock { if used_any && queue.prepare_kick() { active_state .interrupt - .trigger_irq(IrqType::Vring) + .trigger(VirtioInterruptType::Queue(0)) .unwrap_or_else(|_| { self.metrics.event_fails.inc(); }); @@ -509,7 +510,7 @@ impl VirtioBlock { if queue.prepare_kick() { active_state .interrupt - .trigger_irq(IrqType::Vring) + .trigger(VirtioInterruptType::Queue(0)) .unwrap_or_else(|_| { self.metrics.event_fails.inc(); }); @@ -536,10 +537,12 @@ impl VirtioBlock { self.disk.update(disk_image_path, self.read_only)?; self.config_space.capacity = self.disk.nsectors.to_le(); // virtio_block_config_space(); - // Kick the driver to pick up the changes. - self.interrupt_trigger() - .trigger_irq(IrqType::Config) - .unwrap(); + // Kick the driver to pick up the changes. (But only if the device is already activated). 
+ if self.is_activated() { + self.interrupt_trigger() + .trigger(VirtioInterruptType::Config) + .unwrap(); + } self.metrics.update_count.inc(); Ok(()) @@ -606,12 +609,12 @@ impl VirtioDevice for VirtioBlock { &self.queue_evts } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self - .device_state + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { + self.device_state .active_state() .expect("Device is not initialized") .interrupt + .deref() } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -643,7 +646,7 @@ impl VirtioDevice for VirtioBlock { fn activate( &mut self, mem: GuestMemoryMmap, - interrupt: Arc, + interrupt: Arc, ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) diff --git a/src/vmm/src/devices/virtio/block/virtio/mod.rs b/src/vmm/src/devices/virtio/block/virtio/mod.rs index 8ea59a5aba4..9e97d6d3897 100644 --- a/src/vmm/src/devices/virtio/block/virtio/mod.rs +++ b/src/vmm/src/devices/virtio/block/virtio/mod.rs @@ -57,8 +57,8 @@ pub enum VirtioBlockError { BackingFile(std::io::Error, String), /// Error opening eventfd: {0} EventFd(std::io::Error), - /// Error creating an irqfd: {0} - IrqTrigger(std::io::Error), + /// Error creating an interrupt: {0} + Interrupt(std::io::Error), /// Error coming from the rate limiter: {0} RateLimiter(std::io::Error), /// Persistence error: {0} diff --git a/src/vmm/src/devices/virtio/block/virtio/test_utils.rs b/src/vmm/src/devices/virtio/block/virtio/test_utils.rs index 14e2f1d33d0..e4f23c6a038 100644 --- a/src/vmm/src/devices/virtio/block/virtio/test_utils.rs +++ b/src/vmm/src/devices/virtio/block/virtio/test_utils.rs @@ -21,7 +21,7 @@ use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::{Queue, VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; use crate::devices::virtio::test_utils::{VirtQueue, VirtqDesc}; #[cfg(test)] -use crate::devices::virtio::transport::mmio::IrqType; +use crate::devices::virtio::transport::VirtioInterruptType; use 
crate::rate_limiter::RateLimiter; use crate::vmm_config::{RateLimiterConfig, TokenBucketConfig}; use crate::vstate::memory::{Bytes, GuestAddress}; @@ -79,13 +79,15 @@ pub fn rate_limiter(blk: &mut VirtioBlock) -> &RateLimiter { #[cfg(test)] pub fn simulate_queue_event(b: &mut VirtioBlock, maybe_expected_irq: Option) { // Trigger the queue event. + b.queue_evts[0].write(1).unwrap(); // Handle event. b.process_queue_event(); // Validate the queue operation finished successfully. if let Some(expected_irq) = maybe_expected_irq { assert_eq!( - b.interrupt_trigger().has_pending_irq(IrqType::Vring), + b.interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(0)), expected_irq ); } @@ -104,7 +106,8 @@ pub fn simulate_async_completion_event(b: &mut VirtioBlock, expected_irq: bool) // Validate if there are pending IRQs. assert_eq!( - b.interrupt_trigger().has_pending_irq(IrqType::Vring), + b.interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(0)), expected_irq ); } diff --git a/src/vmm/src/devices/virtio/device.rs b/src/vmm/src/devices/virtio/device.rs index efcdd7170c5..0b09195d8f7 100644 --- a/src/vmm/src/devices/virtio/device.rs +++ b/src/vmm/src/devices/virtio/device.rs @@ -13,7 +13,7 @@ use vmm_sys_util::eventfd::EventFd; use super::ActivateError; use super::queue::{Queue, QueueError}; -use super::transport::mmio::IrqTrigger; +use super::transport::VirtioInterrupt; use crate::devices::virtio::AsAny; use crate::logger::warn; use crate::vstate::memory::GuestMemoryMmap; @@ -22,7 +22,7 @@ use crate::vstate::memory::GuestMemoryMmap; #[derive(Debug, Clone)] pub struct ActiveState { pub mem: GuestMemoryMmap, - pub interrupt: Arc, + pub interrupt: Arc, } /// Enum that indicates if a VirtioDevice is inactive or has been activated @@ -88,10 +88,10 @@ pub trait VirtioDevice: AsAny + Send { /// Returns the current device interrupt status. 
fn interrupt_status(&self) -> Arc { - Arc::clone(&self.interrupt_trigger().irq_status) + self.interrupt_trigger().status() } - fn interrupt_trigger(&self) -> &IrqTrigger; + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt; /// The set of feature bits shifted by `page * 32`. fn avail_features_by_page(&self, page: u32) -> u32 { @@ -140,7 +140,7 @@ pub trait VirtioDevice: AsAny + Send { fn activate( &mut self, mem: GuestMemoryMmap, - interrupt: Arc, + interrupt: Arc, ) -> Result<(), ActivateError>; /// Checks if the resources of this device are activated. @@ -205,7 +205,7 @@ pub(crate) mod tests { todo!() } - fn interrupt_trigger(&self) -> &IrqTrigger { + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { todo!() } @@ -220,7 +220,7 @@ pub(crate) mod tests { fn activate( &mut self, _mem: GuestMemoryMmap, - _interrupt: Arc, + _interrupt: Arc, ) -> Result<(), ActivateError> { todo!() } diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index 9949b404809..cf9f445d5df 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -8,6 +8,7 @@ use std::collections::VecDeque; use std::mem::{self}; use std::net::Ipv4Addr; +use std::ops::Deref; use std::sync::{Arc, Mutex}; use libc::{EAGAIN, iovec}; @@ -32,7 +33,7 @@ use crate::devices::virtio::net::{ MAX_BUFFER_SIZE, NET_QUEUE_SIZES, NetError, NetQueue, RX_INDEX, TX_INDEX, generated, }; use crate::devices::virtio::queue::{DescriptorChain, InvalidAvailIdx, Queue}; -use crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; +use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::devices::virtio::{ActivateError, TYPE_NET}; use crate::devices::{DeviceError, report_net_event_fail}; use crate::dumbo::pdu::arp::ETH_IPV4_FRAME_LEN; @@ -390,15 +391,15 @@ impl Net { /// https://docs.oasis-open.org/virtio/virtio/v1.1/csprd01/virtio-v1.1-csprd01.html#x1-320005 /// 2.6.7.1 Driver Requirements: Used 
Buffer Notification Suppression fn try_signal_queue(&mut self, queue_type: NetQueue) -> Result<(), DeviceError> { - let queue = match queue_type { - NetQueue::Rx => &mut self.queues[RX_INDEX], - NetQueue::Tx => &mut self.queues[TX_INDEX], + let qidx = match queue_type { + NetQueue::Rx => RX_INDEX, + NetQueue::Tx => TX_INDEX, }; - queue.advance_used_ring_idx(); + self.queues[qidx].advance_used_ring_idx(); - if queue.prepare_kick() { + if self.queues[qidx].prepare_kick() { self.interrupt_trigger() - .trigger_irq(IrqType::Vring) + .trigger(VirtioInterruptType::Queue(qidx.try_into().unwrap())) .map_err(|err| { self.metrics.event_fails.inc(); DeviceError::FailedSignalingIrq(err) @@ -966,12 +967,12 @@ impl VirtioDevice for Net { &self.queue_evts } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self - .device_state + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { + self.device_state .active_state() .expect("Device is not implemented") .interrupt + .deref() } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -1005,7 +1006,7 @@ impl VirtioDevice for Net { fn activate( &mut self, mem: GuestMemoryMmap, - interrupt: Arc, + interrupt: Arc, ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) @@ -1066,7 +1067,6 @@ pub mod tests { }; use crate::devices::virtio::queue::VIRTQ_DESC_F_WRITE; use crate::devices::virtio::test_utils::VirtQueue; - use crate::devices::virtio::transport::mmio::IrqType; use crate::dumbo::EthernetFrame; use crate::dumbo::pdu::arp::{ETH_IPV4_FRAME_LEN, EthIPv4ArpFrame}; use crate::dumbo::pdu::ethernet::ETHERTYPE_ARP; @@ -1409,7 +1409,12 @@ pub mod tests { // Check that the used queue has advanced. 
assert_eq!(th.rxq.used.idx.get(), 4); - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); + // Check that the invalid descriptor chains have been discarded th.rxq.check_used_elem(0, 0, 0); th.rxq.check_used_elem(1, 3, 0); @@ -1466,7 +1471,11 @@ pub mod tests { assert!(th.net().rx_buffer.used_descriptors == 0); // Check that the used queue has advanced. assert_eq!(th.rxq.used.idx.get(), 1); - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); // Check that the frame has been written successfully to the Rx descriptor chain. header_set_num_buffers(frame.as_mut_slice(), 1); th.rxq @@ -1529,7 +1538,11 @@ pub mod tests { assert!(th.net().rx_buffer.used_bytes == 0); // Check that the used queue has advanced. assert_eq!(th.rxq.used.idx.get(), 2); - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); // Check that the 1st frame was written successfully to the 1st Rx descriptor chain. header_set_num_buffers(frame_1.as_mut_slice(), 1); th.rxq @@ -1587,7 +1600,11 @@ pub mod tests { assert!(th.net().rx_buffer.used_bytes == 0); // Check that the used queue has advanced. assert_eq!(th.rxq.used.idx.get(), 2); - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); // 2 chains should be used for the packet. header_set_num_buffers(frame.as_mut_slice(), 2); @@ -1652,7 +1669,11 @@ pub mod tests { // Check that the used queue advanced. 
assert_eq!(th.txq.used.idx.get(), 1); - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(TX_INDEX as u16)) + ); th.txq.check_used_elem(0, 0, 0); // Check that the frame was skipped. assert!(!tap_traffic_simulator.pop_rx_packet(&mut [])); @@ -1675,7 +1696,11 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(TX_INDEX as u16)) + ); th.txq.check_used_elem(0, 0, 0); // Check that the frame was skipped. assert!(!tap_traffic_simulator.pop_rx_packet(&mut [])); @@ -1702,7 +1727,11 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(TX_INDEX as u16)) + ); th.txq.check_used_elem(0, 0, 0); // Check that the frame was skipped. assert!(!tap_traffic_simulator.pop_rx_packet(&mut [])); @@ -1725,7 +1754,11 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(TX_INDEX as u16)) + ); th.txq.check_used_elem(0, 0, 0); // Check that the frame was skipped. assert!(!tap_traffic_simulator.pop_rx_packet(&mut [])); @@ -1764,7 +1797,11 @@ pub mod tests { // Check that the used queue advanced. 
assert_eq!(th.txq.used.idx.get(), 4); - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(TX_INDEX as u16)) + ); th.txq.check_used_elem(3, 4, 0); // Check that the valid frame was sent to the tap. let mut buf = vec![0; 1000]; @@ -1795,7 +1832,11 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(TX_INDEX as u16)) + ); th.txq.check_used_elem(0, 3, 0); // Check that the frame was sent to the tap. let mut buf = vec![0; 1000]; @@ -1824,7 +1865,11 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 1); - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(TX_INDEX as u16)) + ); th.txq.check_used_elem(0, 0, 0); // dropping th would double close the tap fd, so leak it @@ -1855,7 +1900,11 @@ pub mod tests { // Check that the used queue advanced. assert_eq!(th.txq.used.idx.get(), 2); - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(TX_INDEX as u16)) + ); th.txq.check_used_elem(0, 0, 0); th.txq.check_used_elem(1, 3, 0); // Check that the first frame was sent to the tap. 
@@ -2207,7 +2256,11 @@ pub mod tests { assert_eq!(th.net().metrics.rx_rate_limiter_throttled.count(), 1); assert!(th.net().rx_buffer.used_descriptors != 0); // assert that no operation actually completed (limiter blocked it) - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); // make sure the data is still queued for processing assert_eq!(th.rxq.used.idx.get(), 0); } @@ -2235,7 +2288,11 @@ pub mod tests { // validate the rate_limiter is no longer blocked assert!(!th.net().rx_rate_limiter.is_blocked()); // make sure the virtio queue operation completed this time - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); // make sure the data queue advanced assert_eq!(th.rxq.used.idx.get(), 1); th.rxq @@ -2332,14 +2389,22 @@ pub mod tests { assert!(th.net().metrics.rx_rate_limiter_throttled.count() >= 1); assert!(th.net().rx_buffer.used_descriptors != 0); // assert that no operation actually completed (limiter blocked it) - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); // make sure the data is still queued for processing assert_eq!(th.rxq.used.idx.get(), 0); // trigger the RX handler again, this time it should do the limiter fast path exit th.simulate_event(NetEvent::Tap); // assert that no operation actually completed, that the limiter blocked it - assert!(!th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + !th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); // make sure the data is still queued for processing assert_eq!(th.rxq.used.idx.get(), 0); } @@ -2352,7 +2417,11 @@ 
pub mod tests { { th.simulate_event(NetEvent::RxRateLimiter); // make sure the virtio queue operation completed this time - assert!(th.net().interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + th.net() + .interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); // make sure the data queue advanced assert_eq!(th.rxq.used.idx.get(), 1); th.rxq @@ -2422,7 +2491,14 @@ pub mod tests { assert_eq!(net.queue_events().len(), NET_QUEUE_SIZES.len()); // Test interrupts. - assert!(!net.interrupt_trigger().has_pending_irq(IrqType::Vring)); + assert!( + !net.interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) + ); + assert!( + !net.interrupt_trigger() + .has_pending_interrupt(VirtioInterruptType::Queue(TX_INDEX as u16)) + ); } #[test] diff --git a/src/vmm/src/devices/virtio/net/persist.rs b/src/vmm/src/devices/virtio/net/persist.rs index 9072d3dd5e7..5ebd15f9d54 100644 --- a/src/vmm/src/devices/virtio/net/persist.rs +++ b/src/vmm/src/devices/virtio/net/persist.rs @@ -13,7 +13,7 @@ use super::{NET_NUM_QUEUES, NET_QUEUE_MAX_SIZE, RX_INDEX, TapError}; use crate::devices::virtio::TYPE_NET; use crate::devices::virtio::device::{ActiveState, DeviceState}; use crate::devices::virtio::persist::{PersistError as VirtioStateError, VirtioDeviceState}; -use crate::devices::virtio::transport::mmio::IrqTrigger; +use crate::devices::virtio::transport::VirtioInterrupt; use crate::mmds::data_store::Mmds; use crate::mmds::ns::MmdsNetworkStack; use crate::mmds::persist::MmdsNetworkStackState; @@ -72,7 +72,7 @@ pub struct NetConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, /// Interrupt for the device. - pub interrupt: Arc, + pub interrupt: Arc, /// Pointer to the MMDS data store. 
pub mmds: Option>>, } diff --git a/src/vmm/src/devices/virtio/net/test_utils.rs b/src/vmm/src/devices/virtio/net/test_utils.rs index c81ad58205c..b4fbdf97e3f 100644 --- a/src/vmm/src/devices/virtio/net/test_utils.rs +++ b/src/vmm/src/devices/virtio/net/test_utils.rs @@ -300,7 +300,7 @@ pub mod test { use crate::devices::virtio::net::{MAX_BUFFER_SIZE, Net, RX_INDEX, TX_INDEX}; use crate::devices::virtio::queue::{VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; use crate::devices::virtio::test_utils::{VirtQueue, VirtqDesc, default_interrupt}; - use crate::devices::virtio::transport::mmio::IrqType; + use crate::devices::virtio::transport::VirtioInterruptType; use crate::logger::IncMetric; use crate::vstate::memory::{Address, Bytes, GuestAddress, GuestMemoryMmap}; @@ -444,7 +444,7 @@ pub mod test { assert!( self.net() .interrupt_trigger() - .has_pending_irq(IrqType::Vring) + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) ); frame @@ -474,7 +474,7 @@ pub mod test { assert!( self.net() .interrupt_trigger() - .has_pending_irq(IrqType::Vring) + .has_pending_interrupt(VirtioInterruptType::Queue(RX_INDEX as u16)) ); self.rxq .check_used_elem(used_idx, 0, expected_frame.len().try_into().unwrap()); diff --git a/src/vmm/src/devices/virtio/rng/device.rs b/src/vmm/src/devices/virtio/rng/device.rs index 1433a7086e2..a0b98cdc8b7 100644 --- a/src/vmm/src/devices/virtio/rng/device.rs +++ b/src/vmm/src/devices/virtio/rng/device.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use std::io; +use std::ops::Deref; use std::sync::Arc; use aws_lc_rs::rand; @@ -16,7 +17,7 @@ use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::iov_deque::IovDequeError; use crate::devices::virtio::iovec::IoVecBufferMut; use crate::devices::virtio::queue::{FIRECRACKER_MAX_QUEUE_SIZE, InvalidAvailIdx, Queue}; -use crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; +use crate::devices::virtio::transport::{VirtioInterrupt, 
VirtioInterruptType}; use crate::devices::virtio::{ActivateError, TYPE_RNG}; use crate::logger::{IncMetric, debug, error}; use crate::rate_limiter::{RateLimiter, TokenType}; @@ -87,7 +88,7 @@ impl Entropy { fn signal_used_queue(&self) -> Result<(), DeviceError> { self.interrupt_trigger() - .trigger_irq(IrqType::Vring) + .trigger(VirtioInterruptType::Queue(RNG_QUEUE.try_into().unwrap())) .map_err(DeviceError::FailedSignalingIrq) } @@ -236,7 +237,11 @@ impl Entropy { self.acked_features = features; } - pub(crate) fn set_activated(&mut self, mem: GuestMemoryMmap, interrupt: Arc) { + pub(crate) fn set_activated( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc, + ) { self.device_state = DeviceState::Activated(ActiveState { mem, interrupt }); } @@ -262,12 +267,12 @@ impl VirtioDevice for Entropy { &self.queue_events } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self - .device_state + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { + self.device_state .active_state() .expect("Device is not initialized") .interrupt + .deref() } fn avail_features(&self) -> u64 { @@ -293,7 +298,7 @@ impl VirtioDevice for Entropy { fn activate( &mut self, mem: GuestMemoryMmap, - interrupt: Arc, + interrupt: Arc, ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) diff --git a/src/vmm/src/devices/virtio/rng/persist.rs b/src/vmm/src/devices/virtio/rng/persist.rs index dd2d62debee..75db947c9c7 100644 --- a/src/vmm/src/devices/virtio/rng/persist.rs +++ b/src/vmm/src/devices/virtio/rng/persist.rs @@ -11,7 +11,7 @@ use crate::devices::virtio::TYPE_RNG; use crate::devices::virtio::persist::{PersistError as VirtioStateError, VirtioDeviceState}; use crate::devices::virtio::queue::FIRECRACKER_MAX_QUEUE_SIZE; use crate::devices::virtio::rng::{Entropy, EntropyError, RNG_NUM_QUEUES}; -use crate::devices::virtio::transport::mmio::IrqTrigger; +use crate::devices::virtio::transport::VirtioInterrupt; use crate::rate_limiter::RateLimiter; use 
crate::rate_limiter::persist::RateLimiterState; use crate::snapshot::Persist; @@ -26,11 +26,11 @@ pub struct EntropyState { #[derive(Debug)] pub struct EntropyConstructorArgs { mem: GuestMemoryMmap, - interrupt: Arc, + interrupt: Arc, } impl EntropyConstructorArgs { - pub fn new(mem: GuestMemoryMmap, interrupt: Arc) -> Self { + pub fn new(mem: GuestMemoryMmap, interrupt: Arc) -> Self { Self { mem, interrupt } } } diff --git a/src/vmm/src/devices/virtio/test_utils.rs b/src/vmm/src/devices/virtio/test_utils.rs index 29fbdc5ec56..861394c1c7d 100644 --- a/src/vmm/src/devices/virtio/test_utils.rs +++ b/src/vmm/src/devices/virtio/test_utils.rs @@ -10,12 +10,12 @@ use std::sync::Arc; use std::sync::atomic::{AtomicUsize, Ordering}; use crate::devices::virtio::queue::Queue; +use crate::devices::virtio::transport::VirtioInterrupt; +use crate::devices::virtio::transport::mmio::IrqTrigger; use crate::test_utils::single_region_mem; use crate::utils::{align_up, u64_to_usize}; use crate::vstate::memory::{Address, Bytes, GuestAddress, GuestMemoryMmap}; -use super::transport::mmio::IrqTrigger; - #[macro_export] macro_rules! check_metric_after_block { ($metric:expr, $delta:expr, $block:expr) => {{ @@ -32,7 +32,7 @@ pub fn default_mem() -> GuestMemoryMmap { } /// Creates a default ['IrqTrigger'] interrupt for a VirtIO device. 
-pub fn default_interrupt() -> Arc { +pub fn default_interrupt() -> Arc { Arc::new(IrqTrigger::new()) } diff --git a/src/vmm/src/devices/virtio/transport/mmio.rs b/src/vmm/src/devices/virtio/transport/mmio.rs index 9f72e3fa75a..07cb03fbdbb 100644 --- a/src/vmm/src/devices/virtio/transport/mmio.rs +++ b/src/vmm/src/devices/virtio/transport/mmio.rs @@ -193,7 +193,7 @@ impl MmioTransport { let _ = self .locked_device() .interrupt_trigger() - .trigger_irq(IrqType::Config); + .trigger(VirtioInterruptType::Config); error!("Failed to activate virtio device: {}", err) } @@ -434,7 +434,7 @@ impl IrqTrigger { } } - pub fn trigger_irq(&self, irq_type: IrqType) -> Result<(), std::io::Error> { + fn trigger_irq(&self, irq_type: IrqType) -> Result<(), std::io::Error> { let irq = match irq_type { IrqType::Config => VIRTIO_MMIO_INT_CONFIG, IrqType::Vring => VIRTIO_MMIO_INT_VRING, @@ -453,6 +453,8 @@ impl IrqTrigger { #[cfg(test)] pub(crate) mod tests { + use std::ops::Deref; + use vmm_sys_util::eventfd::EventFd; use super::*; @@ -467,7 +469,7 @@ pub(crate) mod tests { pub(crate) struct DummyDevice { acked_features: u64, avail_features: u64, - interrupt_trigger: Option>, + interrupt_trigger: Option>, queue_evts: Vec, queues: Vec, device_activated: bool, @@ -526,10 +528,11 @@ pub(crate) mod tests { &self.queue_evts } - fn interrupt_trigger(&self) -> &IrqTrigger { + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { self.interrupt_trigger .as_ref() .expect("Device is not activated") + .deref() } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -545,7 +548,7 @@ pub(crate) mod tests { fn activate( &mut self, _: GuestMemoryMmap, - interrupt: Arc, + interrupt: Arc, ) -> Result<(), ActivateError> { self.device_activated = true; self.interrupt_trigger = Some(interrupt); @@ -985,7 +988,8 @@ pub(crate) mod tests { assert_eq!( d.locked_device() .interrupt_trigger() - .irq_evt + .notifier(VirtioInterruptType::Config) + .unwrap() .read() .unwrap(), 1 diff --git 
a/src/vmm/src/devices/virtio/vhost_user.rs b/src/vmm/src/devices/virtio/vhost_user.rs index 4f895e5c05e..556a8adafaf 100644 --- a/src/vmm/src/devices/virtio/vhost_user.rs +++ b/src/vmm/src/devices/virtio/vhost_user.rs @@ -6,6 +6,7 @@ use std::os::fd::AsRawFd; use std::os::unix::net::UnixStream; +use std::sync::Arc; use vhost::vhost_user::message::*; use vhost::vhost_user::{Frontend, VhostUserFrontend}; @@ -14,7 +15,7 @@ use vm_memory::{Address, Error as MmapError, GuestMemory, GuestMemoryError, Gues use vmm_sys_util::eventfd::EventFd; use crate::devices::virtio::queue::Queue; -use crate::devices::virtio::transport::mmio::IrqTrigger; +use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::vstate::memory::GuestMemoryMmap; /// vhost-user error. @@ -400,7 +401,7 @@ impl VhostUserHandleImpl { &mut self, mem: &GuestMemoryMmap, queues: &[(usize, &Queue, &EventFd)], - irq_trigger: &IrqTrigger, + interrupt: Arc, ) -> Result<(), VhostUserError> { // Provide the memory table to the backend. self.update_mem_table(mem)?; @@ -442,7 +443,17 @@ impl VhostUserHandleImpl { // No matter the queue, we set irq_evt for signaling the guest that buffers were // consumed. 
self.vu - .set_vring_call(*queue_index, &irq_trigger.irq_evt) + .set_vring_call( + *queue_index, + interrupt + .notifier(VirtioInterruptType::Queue( + (*queue_index).try_into().unwrap_or_else(|_| { + panic!("vhost-user: invalid queue index: {}", *queue_index) + }), + )) + .as_ref() + .unwrap(), + ) .map_err(VhostUserError::VhostUserSetVringCall)?; self.vu @@ -467,6 +478,7 @@ pub(crate) mod tests { use vmm_sys_util::tempfile::TempFile; use super::*; + use crate::devices::virtio::test_utils::default_interrupt; use crate::test_utils::create_tmp_socket; use crate::vstate::memory; use crate::vstate::memory::GuestAddress; @@ -901,11 +913,11 @@ pub(crate) mod tests { queue.initialize(&guest_memory).unwrap(); let event_fd = EventFd::new(0).unwrap(); - let irq_trigger = IrqTrigger::new(); let queues = [(0, &queue, &event_fd)]; - vuh.setup_backend(&guest_memory, &queues, &irq_trigger) + let interrupt = default_interrupt(); + vuh.setup_backend(&guest_memory, &queues, interrupt.clone()) .unwrap(); // VhostUserHandleImpl should correctly send memory and queues information to @@ -929,7 +941,11 @@ pub(crate) mod tests { log_addr: None, }, base: queue.avail_ring_idx_get(), - call: irq_trigger.irq_evt.as_raw_fd(), + call: interrupt + .notifier(VirtioInterruptType::Queue(0u16)) + .as_ref() + .unwrap() + .as_raw_fd(), kick: event_fd.as_raw_fd(), enable: true, }; diff --git a/src/vmm/src/devices/virtio/vsock/device.rs b/src/vmm/src/devices/virtio/vsock/device.rs index ad049b517e4..61ca3246d43 100644 --- a/src/vmm/src/devices/virtio/vsock/device.rs +++ b/src/vmm/src/devices/virtio/vsock/device.rs @@ -21,6 +21,7 @@ //! - a backend FD. 
use std::fmt::Debug; +use std::ops::Deref; use std::sync::Arc; use log::{error, warn}; @@ -34,7 +35,7 @@ use crate::devices::virtio::ActivateError; use crate::devices::virtio::device::{ActiveState, DeviceState, VirtioDevice}; use crate::devices::virtio::generated::virtio_config::{VIRTIO_F_IN_ORDER, VIRTIO_F_VERSION_1}; use crate::devices::virtio::queue::{InvalidAvailIdx, Queue as VirtQueue}; -use crate::devices::virtio::transport::mmio::{IrqTrigger, IrqType}; +use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::devices::virtio::vsock::VsockError; use crate::devices::virtio::vsock::metrics::METRICS; use crate::logger::IncMetric; @@ -136,12 +137,14 @@ where /// Signal the guest driver that we've used some virtio buffers that it had previously made /// available. - pub fn signal_used_queue(&self) -> Result<(), DeviceError> { + pub fn signal_used_queue(&self, qidx: usize) -> Result<(), DeviceError> { self.device_state .active_state() .expect("Device is not initialized") .interrupt - .trigger_irq(IrqType::Vring) + .trigger(VirtioInterruptType::Queue(qidx.try_into().unwrap_or_else( + |_| panic!("vsock: invalid queue index: {qidx}"), + ))) .map_err(DeviceError::FailedSignalingIrq) } @@ -259,7 +262,7 @@ where }); queue.advance_used_ring_idx(); - self.signal_used_queue()?; + self.signal_used_queue(EVQ_INDEX)?; Ok(()) } @@ -297,12 +300,12 @@ where &self.queue_events } - fn interrupt_trigger(&self) -> &IrqTrigger { - &self - .device_state + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { + self.device_state .active_state() .expect("Device is not initialized") .interrupt + .deref() } fn read_config(&self, offset: u64, data: &mut [u8]) { @@ -337,7 +340,7 @@ where fn activate( &mut self, mem: GuestMemoryMmap, - interrupt: Arc, + interrupt: Arc, ) -> Result<(), ActivateError> { for q in self.queues.iter_mut() { q.initialize(&mem) diff --git a/src/vmm/src/devices/virtio/vsock/event_handler.rs 
b/src/vmm/src/devices/virtio/vsock/event_handler.rs index 9c909048a69..a54998ba808 100755 --- a/src/vmm/src/devices/virtio/vsock/event_handler.rs +++ b/src/vmm/src/devices/virtio/vsock/event_handler.rs @@ -197,9 +197,10 @@ where Self::PROCESS_EVQ => raise_irq = self.handle_evq_event(evset), Self::PROCESS_NOTIFY_BACKEND => raise_irq = self.notify_backend(evset).unwrap(), _ => warn!("Unexpected vsock event received: {:?}", source), - } + }; if raise_irq { - self.signal_used_queue().unwrap_or_default(); + self.signal_used_queue(source as usize) + .expect("vsock: Could not trigger device interrupt"); } } else { warn!( diff --git a/src/vmm/src/devices/virtio/vsock/persist.rs b/src/vmm/src/devices/virtio/vsock/persist.rs index 3d0967926be..9d2fd61d9d5 100644 --- a/src/vmm/src/devices/virtio/vsock/persist.rs +++ b/src/vmm/src/devices/virtio/vsock/persist.rs @@ -12,7 +12,7 @@ use super::*; use crate::devices::virtio::device::{ActiveState, DeviceState}; use crate::devices::virtio::persist::VirtioDeviceState; use crate::devices::virtio::queue::FIRECRACKER_MAX_QUEUE_SIZE; -use crate::devices::virtio::transport::mmio::IrqTrigger; +use crate::devices::virtio::transport::VirtioInterrupt; use crate::devices::virtio::vsock::TYPE_VSOCK; use crate::snapshot::Persist; use crate::vstate::memory::GuestMemoryMmap; @@ -54,7 +54,7 @@ pub struct VsockConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, /// Interrupt to use for the device. - pub interrupt: Arc, + pub interrupt: Arc, /// The vsock Unix Backend. 
pub backend: B, } diff --git a/src/vmm/src/devices/virtio/vsock/test_utils.rs b/src/vmm/src/devices/virtio/vsock/test_utils.rs index 56795e5fd36..b38ce070c66 100644 --- a/src/vmm/src/devices/virtio/vsock/test_utils.rs +++ b/src/vmm/src/devices/virtio/vsock/test_utils.rs @@ -14,7 +14,7 @@ use super::packet::{VsockPacketRx, VsockPacketTx}; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::{VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; use crate::devices::virtio::test_utils::{VirtQueue as GuestQ, default_interrupt}; -use crate::devices::virtio::transport::mmio::IrqTrigger; +use crate::devices::virtio::transport::VirtioInterrupt; use crate::devices::virtio::vsock::device::{RXQ_INDEX, TXQ_INDEX}; use crate::devices::virtio::vsock::packet::VSOCK_PKT_HDR_SIZE; use crate::devices::virtio::vsock::{ @@ -119,7 +119,7 @@ impl VsockBackend for TestBackend {} pub struct TestContext { pub cid: u64, pub mem: GuestMemoryMmap, - pub interrupt: Arc, + pub interrupt: Arc, pub mem_size: usize, pub device: Vsock, } @@ -200,7 +200,7 @@ pub struct EventHandlerContext<'a> { } impl EventHandlerContext<'_> { - pub fn mock_activate(&mut self, mem: GuestMemoryMmap, interrupt: Arc) { + pub fn mock_activate(&mut self, mem: GuestMemoryMmap, interrupt: Arc) { // Artificially activate the device. self.device.activate(mem, interrupt).unwrap(); } From a4e4021b6684f27ae7a01569e59e16496788350c Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 29 Apr 2025 14:58:54 +0200 Subject: [PATCH 06/56] vm-device: add vm-device crate to repo Bring in the vm-device crate from CloudHypervisor. We will be using it for adding PCIe support. 
Signed-off-by: Babis Chalios --- Cargo.lock | 8 + src/vm-device/Cargo.toml | 16 ++ src/vm-device/src/bus.rs | 407 +++++++++++++++++++++++++++ src/vm-device/src/dma_mapping/mod.rs | 18 ++ src/vm-device/src/interrupt/mod.rs | 194 +++++++++++++ src/vm-device/src/lib.rs | 63 +++++ src/vmm/Cargo.toml | 10 +- 7 files changed, 714 insertions(+), 2 deletions(-) create mode 100644 src/vm-device/Cargo.toml create mode 100644 src/vm-device/src/bus.rs create mode 100644 src/vm-device/src/dma_mapping/mod.rs create mode 100644 src/vm-device/src/interrupt/mod.rs create mode 100644 src/vm-device/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 74155155226..82fa7b8c2d8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1587,6 +1587,14 @@ dependencies = [ "thiserror 2.0.12", ] +[[package]] +name = "vm-device" +version = "0.1.0" +dependencies = [ + "serde", + "vmm-sys-util", +] + [[package]] name = "vm-fdt" version = "0.3.0" diff --git a/src/vm-device/Cargo.toml b/src/vm-device/Cargo.toml new file mode 100644 index 00000000000..b6471ab9f6a --- /dev/null +++ b/src/vm-device/Cargo.toml @@ -0,0 +1,16 @@ +[package] +authors = ["The Cloud Hypervisor Authors"] +edition = "2021" +name = "vm-device" +version = "0.1.0" +license = "Apache-2.0 AND BSD-3-Clause" + +[lib] +bench = false + +[features] +default = [] + +[dependencies] +serde = { version = "1.0.208", features = ["derive", "rc"] } +vmm-sys-util = { version = "0.14.0", features = ["with-serde"] } diff --git a/src/vm-device/src/bus.rs b/src/vm-device/src/bus.rs new file mode 100644 index 00000000000..31880d354bb --- /dev/null +++ b/src/vm-device/src/bus.rs @@ -0,0 +1,407 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. + +//! Handles routing to devices in an address space. 
+ +use std::cmp::Ordering; +use std::collections::btree_map::BTreeMap; +use std::sync::{Arc, Barrier, Mutex, RwLock, Weak}; +use std::{convert, error, fmt, io, result}; + +/// Trait for devices that respond to reads or writes in an arbitrary address space. +/// +/// The device does not care where it exists in address space as each method is only given an offset +/// into its allocated portion of address space. +#[allow(unused_variables)] +pub trait BusDevice: Send { + /// Reads at `offset` from this device + fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) {} + /// Writes at `offset` into this device + fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option> { + None + } +} + +#[allow(unused_variables)] +pub trait BusDeviceSync: Send + Sync { + /// Reads at `offset` from this device + fn read(&self, base: u64, offset: u64, data: &mut [u8]) {} + /// Writes at `offset` into this device + fn write(&self, base: u64, offset: u64, data: &[u8]) -> Option> { + None + } +} + +impl BusDeviceSync for Mutex { + /// Reads at `offset` from this device + fn read(&self, base: u64, offset: u64, data: &mut [u8]) { + self.lock() + .expect("Failed to acquire device lock") + .read(base, offset, data) + } + /// Writes at `offset` into this device + fn write(&self, base: u64, offset: u64, data: &[u8]) -> Option> { + self.lock() + .expect("Failed to acquire device lock") + .write(base, offset, data) + } +} + +#[derive(Debug)] +pub enum Error { + /// The insertion failed because the new device overlapped with an old device. + Overlap, + /// Failed to operate on zero sized range. + ZeroSizedRange, + /// Failed to find address range. 
+ MissingAddressRange, +} + +pub type Result = result::Result; + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "bus_error: {self:?}") + } +} + +impl error::Error for Error {} + +impl convert::From for io::Error { + fn from(e: Error) -> Self { + io::Error::other(e) + } +} + +/// Holds a base and length representing the address space occupied by a `BusDevice`. +/// +/// * base - The address at which the range start. +/// * len - The length of the range in bytes. +#[derive(Debug, Copy, Clone)] +pub struct BusRange { + pub base: u64, + pub len: u64, +} + +impl BusRange { + /// Returns true if there is overlap with the given range. + pub fn overlaps(&self, base: u64, len: u64) -> bool { + self.base < (base + len) && base < self.base + self.len + } +} + +impl Eq for BusRange {} + +impl PartialEq for BusRange { + fn eq(&self, other: &BusRange) -> bool { + self.base == other.base + } +} + +impl Ord for BusRange { + fn cmp(&self, other: &BusRange) -> Ordering { + self.base.cmp(&other.base) + } +} + +impl PartialOrd for BusRange { + fn partial_cmp(&self, other: &BusRange) -> Option { + Some(self.cmp(other)) + } +} + +/// A device container for routing reads and writes over some address space. +/// +/// This doesn't have any restrictions on what kind of device or address space this applies to. The +/// only restriction is that no two devices can overlap in this address space. +#[derive(Default, Debug)] +pub struct Bus { + devices: RwLock>>, +} + +impl Bus { + /// Constructs an a bus with an empty address space. 
+ pub fn new() -> Bus { + Bus { + devices: RwLock::new(BTreeMap::new()), + } + } + + fn first_before(&self, addr: u64) -> Option<(BusRange, Arc)> { + let devices = self.devices.read().unwrap(); + let (range, dev) = devices + .range(..=BusRange { base: addr, len: 1 }) + .next_back()?; + dev.upgrade().map(|d| (*range, d.clone())) + } + + #[allow(clippy::type_complexity)] + pub fn resolve(&self, addr: u64) -> Option<(u64, u64, Arc)> { + if let Some((range, dev)) = self.first_before(addr) { + let offset = addr - range.base; + if offset < range.len { + return Some((range.base, offset, dev)); + } + } + None + } + + pub fn insert(&self, device: Arc, base: u64, len: u64) -> Result<()> { + if len == 0 { + return Err(Error::ZeroSizedRange); + } + + // Reject all cases where the new device's range overlaps with an existing device. + if self + .devices + .read() + .unwrap() + .iter() + .any(|(range, _dev)| range.overlaps(base, len)) + { + return Err(Error::Overlap); + } + + if self + .devices + .write() + .unwrap() + .insert(BusRange { base, len }, Arc::downgrade(&device)) + .is_some() + { + return Err(Error::Overlap); + } + + Ok(()) + } + + /// Removes the device at the given address space range. + pub fn remove(&self, base: u64, len: u64) -> Result<()> { + if len == 0 { + return Err(Error::ZeroSizedRange); + } + + let bus_range = BusRange { base, len }; + + if self.devices.write().unwrap().remove(&bus_range).is_none() { + return Err(Error::MissingAddressRange); + } + + Ok(()) + } + + /// Removes all entries referencing the given device. + pub fn remove_by_device(&self, device: &Arc) -> Result<()> { + let mut device_list = self.devices.write().unwrap(); + let mut remove_key_list = Vec::new(); + + for (key, value) in device_list.iter() { + if Arc::ptr_eq(&value.upgrade().unwrap(), device) { + remove_key_list.push(*key); + } + } + + for key in remove_key_list.iter() { + device_list.remove(key); + } + + Ok(()) + } + + /// Updates the address range for an existing device. 
+ pub fn update_range( + &self, + old_base: u64, + old_len: u64, + new_base: u64, + new_len: u64, + ) -> Result<()> { + // Retrieve the device corresponding to the range + let device = if let Some((_, _, dev)) = self.resolve(old_base) { + dev.clone() + } else { + return Err(Error::MissingAddressRange); + }; + + // Remove the old address range + self.remove(old_base, old_len)?; + + // Insert the new address range + self.insert(device, new_base, new_len) + } + + /// Reads data from the device that owns the range containing `addr` and puts it into `data`. + /// + /// Returns true on success, otherwise `data` is untouched. + pub fn read(&self, addr: u64, data: &mut [u8]) -> Result<()> { + if let Some((base, offset, dev)) = self.resolve(addr) { + // OK to unwrap as lock() failing is a serious error condition and should panic. + dev.read(base, offset, data); + Ok(()) + } else { + Err(Error::MissingAddressRange) + } + } + + /// Writes `data` to the device that owns the range containing `addr`. + /// + /// Returns true on success, otherwise `data` is untouched. + pub fn write(&self, addr: u64, data: &[u8]) -> Result>> { + if let Some((base, offset, dev)) = self.resolve(addr) { + // OK to unwrap as lock() failing is a serious error condition and should panic. 
+ Ok(dev.write(base, offset, data)) + } else { + Err(Error::MissingAddressRange) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + struct DummyDevice; + impl BusDeviceSync for DummyDevice {} + + struct ConstantDevice; + impl BusDeviceSync for ConstantDevice { + fn read(&self, _base: u64, offset: u64, data: &mut [u8]) { + for (i, v) in data.iter_mut().enumerate() { + *v = (offset as u8) + (i as u8); + } + } + + fn write(&self, _base: u64, offset: u64, data: &[u8]) -> Option> { + for (i, v) in data.iter().enumerate() { + assert_eq!(*v, (offset as u8) + (i as u8)) + } + + None + } + } + + #[test] + fn bus_insert() { + let bus = Bus::new(); + let dummy = Arc::new(DummyDevice); + bus.insert(dummy.clone(), 0x10, 0).unwrap_err(); + bus.insert(dummy.clone(), 0x10, 0x10).unwrap(); + + let result = bus.insert(dummy.clone(), 0x0f, 0x10); + assert_eq!(format!("{result:?}"), "Err(Overlap)"); + + bus.insert(dummy.clone(), 0x10, 0x10).unwrap_err(); + bus.insert(dummy.clone(), 0x10, 0x15).unwrap_err(); + bus.insert(dummy.clone(), 0x12, 0x15).unwrap_err(); + bus.insert(dummy.clone(), 0x12, 0x01).unwrap_err(); + bus.insert(dummy.clone(), 0x0, 0x20).unwrap_err(); + bus.insert(dummy.clone(), 0x20, 0x05).unwrap(); + bus.insert(dummy.clone(), 0x25, 0x05).unwrap(); + bus.insert(dummy, 0x0, 0x10).unwrap(); + } + + #[test] + fn bus_remove() { + let bus = Bus::new(); + let dummy: Arc = Arc::new(DummyDevice); + + bus.remove(0x42, 0x0).unwrap_err(); + + bus.remove(0x13, 0x12).unwrap_err(); + + bus.insert(dummy.clone(), 0x13, 0x12).unwrap(); + bus.remove(0x42, 0x42).unwrap_err(); + bus.remove(0x13, 0x12).unwrap(); + + bus.insert(dummy.clone(), 0x16, 0x1).unwrap(); + bus.remove_by_device(&dummy).unwrap(); + bus.remove(0x16, 0x1).unwrap_err(); + } + + #[test] + #[allow(clippy::redundant_clone)] + fn bus_read_write() { + let bus = Bus::new(); + let dummy = Arc::new(DummyDevice); + bus.insert(dummy.clone(), 0x10, 0x10).unwrap(); + bus.read(0x10, &mut [0, 0, 0, 0]).unwrap(); + 
bus.write(0x10, &[0, 0, 0, 0]).unwrap(); + bus.read(0x11, &mut [0, 0, 0, 0]).unwrap(); + bus.write(0x11, &[0, 0, 0, 0]).unwrap(); + bus.read(0x16, &mut [0, 0, 0, 0]).unwrap(); + bus.write(0x16, &[0, 0, 0, 0]).unwrap(); + bus.read(0x20, &mut [0, 0, 0, 0]).unwrap_err(); + bus.write(0x20, &[0, 0, 0, 0]).unwrap_err(); + bus.read(0x06, &mut [0, 0, 0, 0]).unwrap_err(); + bus.write(0x06, &[0, 0, 0, 0]).unwrap_err(); + } + + #[test] + #[allow(clippy::redundant_clone)] + fn bus_read_write_values() { + let bus = Bus::new(); + let dummy = Arc::new(ConstantDevice); + bus.insert(dummy.clone(), 0x10, 0x10).unwrap(); + + let mut values = [0, 1, 2, 3]; + bus.read(0x10, &mut values).unwrap(); + assert_eq!(values, [0, 1, 2, 3]); + bus.write(0x10, &values).unwrap(); + bus.read(0x15, &mut values).unwrap(); + assert_eq!(values, [5, 6, 7, 8]); + bus.write(0x15, &values).unwrap(); + } + + #[test] + #[allow(clippy::redundant_clone)] + fn busrange_cmp() { + let range = BusRange { base: 0x10, len: 2 }; + assert_eq!(range, BusRange { base: 0x10, len: 3 }); + assert_eq!(range, BusRange { base: 0x10, len: 2 }); + + assert!(range < BusRange { base: 0x12, len: 1 }); + assert!(range < BusRange { base: 0x12, len: 3 }); + + assert_eq!(range, range.clone()); + + let bus = Bus::new(); + let mut data = [1, 2, 3, 4]; + let device = Arc::new(DummyDevice); + bus.insert(device.clone(), 0x10, 0x10).unwrap(); + bus.write(0x10, &data).unwrap(); + bus.read(0x10, &mut data).unwrap(); + assert_eq!(data, [1, 2, 3, 4]); + } + + #[test] + fn bus_range_overlap() { + let a = BusRange { + base: 0x1000, + len: 0x400, + }; + assert!(a.overlaps(0x1000, 0x400)); + assert!(a.overlaps(0xf00, 0x400)); + assert!(a.overlaps(0x1000, 0x01)); + assert!(a.overlaps(0xfff, 0x02)); + assert!(a.overlaps(0x1100, 0x100)); + assert!(a.overlaps(0x13ff, 0x100)); + assert!(!a.overlaps(0x1400, 0x100)); + assert!(!a.overlaps(0xf00, 0x100)); + } + + #[test] + fn bus_update_range() { + let bus = Bus::new(); + let dummy = Arc::new(DummyDevice); 
+ + bus.update_range(0x13, 0x12, 0x16, 0x1).unwrap_err(); + bus.insert(dummy.clone(), 0x13, 12).unwrap(); + + bus.update_range(0x16, 0x1, 0x13, 0x12).unwrap_err(); + bus.update_range(0x13, 0x12, 0x16, 0x1).unwrap(); + } +} diff --git a/src/vm-device/src/dma_mapping/mod.rs b/src/vm-device/src/dma_mapping/mod.rs new file mode 100644 index 00000000000..6cba6e16488 --- /dev/null +++ b/src/vm-device/src/dma_mapping/mod.rs @@ -0,0 +1,18 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright © 2021 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause + +/// Trait to trigger DMA mapping updates for devices managed by virtio-iommu +/// +/// Trait meant for triggering the DMA mapping update related to an external +/// device not managed fully through virtio. It is dedicated to virtio-iommu +/// in order to trigger the map update anytime the mapping is updated from the +/// guest. +pub trait ExternalDmaMapping: Send + Sync { + /// Map a memory range + fn map(&self, iova: u64, gpa: u64, size: u64) -> std::result::Result<(), std::io::Error>; + + /// Unmap a memory range + fn unmap(&self, iova: u64, size: u64) -> std::result::Result<(), std::io::Error>; +} diff --git a/src/vm-device/src/interrupt/mod.rs b/src/vm-device/src/interrupt/mod.rs new file mode 100644 index 00000000000..f4aec52a2e0 --- /dev/null +++ b/src/vm-device/src/interrupt/mod.rs @@ -0,0 +1,194 @@ +// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright (C) 2019 Alibaba Cloud. All rights reserved. +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause + +//! Traits and Structs to manage interrupt sources for devices. +//! +//! In system programming, an interrupt is a signal to the processor emitted by hardware or +//! software indicating an event that needs immediate attention. An interrupt alerts the processor +//! 
to a high-priority condition requiring the interruption of the current code the processor is +//! executing. The processor responds by suspending its current activities, saving its state, and +//! executing a function called an interrupt handler (or an interrupt service routine, ISR) to deal +//! with the event. This interruption is temporary, and, after the interrupt handler finishes, +//! unless handling the interrupt has emitted a fatal error, the processor resumes normal +//! activities. +//! +//! Hardware interrupts are used by devices to communicate that they require attention from the +//! operating system, or a bare-metal program running on the CPU if there is no OS. The act of +//! initiating a hardware interrupt is referred to as an interrupt request (IRQ). Different devices +//! are usually associated with different interrupts using a unique value associated with each +//! interrupt. This makes it possible to know which hardware device caused which interrupts. +//! These interrupt values are often called IRQ lines, or just interrupt lines. +//! +//! Nowadays, IRQ lines are not the only mechanism to deliver device interrupts to processors. +//! MSI [(Message Signaled Interrupt)](https://en.wikipedia.org/wiki/Message_Signaled_Interrupts) +//! is another commonly used alternative in-band method of signaling an interrupt, using special +//! in-band messages to replace traditional out-of-band assertion of dedicated interrupt lines. +//! While more complex to implement in a device, message signaled interrupts have some significant +//! advantages over pin-based out-of-band interrupt signaling. Message signaled interrupts are +//! supported by the PCI bus since version 2.2, and by the later PCI Express bus. Some +//! non-PCI architectures also use message signaled interrupts. +//! +//! While IRQ is a term commonly used by Operating Systems when dealing with hardware +//!
interrupts, the IRQ numbers managed by OSes are independent of the ones managed by the VMM. +//! For simplicity's sake, the term `Interrupt Source` is used instead of IRQ to represent both +//! pin-based interrupts and MSI interrupts. +//! +//! A device may support multiple types of interrupts, and each type of interrupt may support one +//! or multiple interrupt sources. For example, a PCI device may support: +//! * Legacy Irq: exactly one interrupt source. +//! * PCI MSI Irq: 1,2,4,8,16,32 interrupt sources. +//! * PCI MSIx Irq: 2^n(n=0-11) interrupt sources. +//! +//! A distinct Interrupt Source Identifier (ISID) will be assigned to each interrupt source. +//! An ID allocator will be used to allocate and free Interrupt Source Identifiers for devices. +//! To decouple the vm-device crate from the ID allocator, the vm-device crate doesn't take the +//! responsibility to allocate/free Interrupt Source IDs but only makes use of assigned IDs. +//! +//! The overall flow to deal with interrupts is: +//! * The VMM creates an interrupt manager +//! * The VMM creates a device manager, passing on a reference to the interrupt manager +//! * The device manager passes on a reference to the interrupt manager to all registered devices +//! * The guest kernel loads drivers for virtual devices +//! * The guest device driver determines the type and number of interrupts needed, and updates the +//! device configuration +//! * The virtual device backend requests the interrupt manager to create an interrupt group +//! according to guest configuration information + +use std::sync::Arc; + +use vmm_sys_util::eventfd::EventFd; + +/// Reuse std::io::Result to simplify interoperability among crates. +pub type Result = std::io::Result; + +/// Data type to store an interrupt source identifier. +pub type InterruptIndex = u32; + +/// Configuration data for legacy interrupts. +/// +/// On x86 platforms, legacy interrupts are those interrupts routed through PICs or IOAPICs.
+#[derive(Copy, Clone, Debug)] +pub struct LegacyIrqSourceConfig { + pub irqchip: u32, + pub pin: u32, +} + +/// Configuration data for MSI/MSI-X interrupts. +/// +/// On x86 platforms, these interrupts are vectors delivered directly to the LAPIC. +#[derive(Copy, Clone, Debug, Default)] +pub struct MsiIrqSourceConfig { + /// High address to deliver the message signaled interrupt. + pub high_addr: u32, + /// Low address to deliver the message signaled interrupt. + pub low_addr: u32, + /// Data to write to deliver the message signaled interrupt. + pub data: u32, + /// Unique ID of the device to deliver the message signaled interrupt. + pub devid: u32, +} + +/// Configuration data for an interrupt source. +#[derive(Copy, Clone, Debug)] +pub enum InterruptSourceConfig { + /// Configuration data for Legacy interrupts. + LegacyIrq(LegacyIrqSourceConfig), + /// Configuration data for PciMsi, PciMsix and generic MSI interrupts. + MsiIrq(MsiIrqSourceConfig), +} + +/// Configuration data for legacy, pin-based interrupt groups. +/// +/// A legacy interrupt group only takes one irq number as its configuration. +#[derive(Copy, Clone, Debug)] +pub struct LegacyIrqGroupConfig { + /// Legacy irq number. + pub irq: InterruptIndex, +} + +/// Configuration data for MSI/MSI-X interrupt groups +/// +/// MSI/MSI-X interrupt groups are basically a set of vectors. +#[derive(Copy, Clone, Debug)] +pub struct MsiIrqGroupConfig { + /// First index of the MSI/MSI-X interrupt vectors + pub base: InterruptIndex, + /// Number of vectors in the MSI/MSI-X group. + pub count: InterruptIndex, +} + +/// Trait to manage interrupt sources for virtual device backends. +/// +/// InterruptManager implementations should protect themselves from concurrent accesses internally, +/// so they can be invoked from a multi-threaded context.
+pub trait InterruptManager: Send + Sync { + type GroupConfig; + + /// Create an [InterruptSourceGroup](trait.InterruptSourceGroup.html) object to manage + /// interrupt sources for a virtual device + /// + /// An [InterruptSourceGroup](trait.InterruptSourceGroup.html) object manages all interrupt + /// sources of the same type for a virtual device. + /// + /// # Arguments + /// * interrupt_type: type of interrupt source. + /// * base: base Interrupt Source ID to be managed by the group object. + /// * count: number of Interrupt Sources to be managed by the group object. + fn create_group(&self, config: Self::GroupConfig) -> Result>; + + /// Destroy an [InterruptSourceGroup](trait.InterruptSourceGroup.html) object created by + /// [create_group()](trait.InterruptManager.html#tymethod.create_group). + /// + /// Assume the caller takes the responsibility to disable all interrupt sources of the group + /// before calling destroy_group(). This assumption helps to simplify InterruptSourceGroup + /// implementations. + fn destroy_group(&self, group: Arc) -> Result<()>; +} + +pub trait InterruptSourceGroup: Send + Sync { + /// Enable the interrupt sources in the group to generate interrupts. + fn enable(&self) -> Result<()> { + // Not all interrupt sources can be enabled. + // To accommodate this, we can have a no-op here. + Ok(()) + } + + /// Disable the interrupt sources in the group to generate interrupts. + fn disable(&self) -> Result<()> { + // Not all interrupt sources can be disabled. + // To accommodate this, we can have a no-op here. + Ok(()) + } + + /// Inject an interrupt from this interrupt source into the guest. + fn trigger(&self, index: InterruptIndex) -> Result<()>; + + /// Returns an interrupt notifier from this interrupt. + /// + /// An interrupt notifier allows for external components and processes + /// to inject interrupts into a guest, by writing to the file returned + /// by this method. 
+ #[allow(unused_variables)] + fn notifier(&self, index: InterruptIndex) -> Option; + + /// Update the interrupt source group configuration. + /// + /// # Arguments + /// * index: sub-index into the group. + /// * config: configuration data for the interrupt source. + /// * masked: if the interrupt is masked + /// * set_gsi: whether update the GSI routing table. + fn update( + &self, + index: InterruptIndex, + config: InterruptSourceConfig, + masked: bool, + set_gsi: bool, + ) -> Result<()>; + + /// Set the interrupt group GSI routing table. + fn set_gsi(&self) -> Result<()>; +} diff --git a/src/vm-device/src/lib.rs b/src/vm-device/src/lib.rs new file mode 100644 index 00000000000..fe06fd8b465 --- /dev/null +++ b/src/vm-device/src/lib.rs @@ -0,0 +1,63 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 +// + +use serde::{Deserialize, Serialize}; + +mod bus; +pub mod dma_mapping; +pub mod interrupt; + +pub use self::bus::{Bus, BusDevice, BusDeviceSync, Error as BusError}; + +/// Type of Message Signalled Interrupt +#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub enum MsiIrqType { + /// PCI MSI IRQ numbers. + PciMsi, + /// PCI MSIx IRQ numbers. + PciMsix, + /// Generic MSI IRQ numbers. + GenericMsi, +} + +#[derive(Copy, Clone, PartialEq, Eq, Serialize, Deserialize, Debug)] +pub enum PciBarType { + Io, + Mmio32, + Mmio64, +} + +/// Enumeration for device resources. +#[allow(missing_docs)] +#[derive(Clone, Debug, Serialize, Deserialize)] +pub enum Resource { + /// IO Port address range. + PioAddressRange { base: u16, size: u16 }, + /// Memory Mapped IO address range. + MmioAddressRange { base: u64, size: u64 }, + /// PCI BAR + PciBar { + index: usize, + base: u64, + size: u64, + type_: PciBarType, + prefetchable: bool, + }, + /// Legacy IRQ number. 
+ LegacyIrq(u32), + /// Message Signaled Interrupt + MsiIrq { + ty: MsiIrqType, + base: u32, + size: u32, + }, + /// Network Interface Card MAC address. + MacAddress(String), + /// KVM memslot index. + KvmMemSlot(u32), +} diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index 16091cf724a..995f314e46f 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -22,7 +22,10 @@ base64 = "0.22.1" bincode = { version = "2.0.1", features = ["serde"] } bitflags = "2.9.1" crc64 = "2.0.0" -derive_more = { version = "2.0.1", default-features = false, features = ["from", "display"] } +derive_more = { version = "2.0.1", default-features = false, features = [ + "from", + "display", +] } displaydoc = "0.2.5" event-manager = "0.4.1" gdbstub = { version = "0.7.6", optional = true } @@ -45,7 +48,10 @@ userfaultfd = "0.8.1" utils = { path = "../utils" } vhost = { version = "0.14.0", features = ["vhost-user-frontend"] } vm-allocator = "0.1.3" -vm-memory = { version = "0.16.2", features = ["backend-mmap", "backend-bitmap"] } +vm-memory = { version = "0.16.2", features = [ + "backend-mmap", + "backend-bitmap", +] } vm-superio = "0.8.0" vmm-sys-util = { version = "0.14.0", features = ["with-serde"] } zerocopy = { version = "0.8.26" } From eadaba1daaa25c8087b9643f2ec1e675775236c3 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Thu, 24 Apr 2025 16:17:13 +0200 Subject: [PATCH 07/56] refactor(serial): remove generics from SerialDevice We use `SerialDevice` with Stdin as the input source. Encode this in the type so that we don't spill the generic all over the place. 
Signed-off-by: Babis Chalios Co-authored-by: Egor Lazarchuk Signed-off-by: Egor Lazarchuk Signed-off-by: Babis Chalios --- src/vmm/src/builder.rs | 45 +++++++++------------------ src/vmm/src/device_manager/persist.rs | 7 ++--- src/vmm/src/devices/bus.rs | 6 ++-- src/vmm/src/devices/legacy/serial.rs | 27 ++++++++++++++-- 4 files changed, 43 insertions(+), 42 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index ba54929d451..db4f11b7a26 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -10,7 +10,6 @@ use std::sync::mpsc; use std::sync::{Arc, Mutex}; use event_manager::{MutEventSubscriber, SubscriberOps}; -use libc::EFD_NONBLOCK; use linux_loader::cmdline::Cmdline as LoaderKernelCmdline; use userfaultfd::Uffd; use utils::time::TimestampUs; @@ -18,8 +17,6 @@ use utils::time::TimestampUs; use vm_memory::GuestAddress; #[cfg(target_arch = "aarch64")] use vm_superio::Rtc; -use vm_superio::Serial; -use vmm_sys_util::eventfd::EventFd; use crate::arch::{ConfigurationError, configure_system_for_boot, load_kernel}; #[cfg(target_arch = "aarch64")] @@ -39,8 +36,8 @@ use crate::devices::BusDevice; use crate::devices::acpi::vmgenid::{VmGenId, VmGenIdError}; #[cfg(target_arch = "aarch64")] use crate::devices::legacy::RTCDevice; +use crate::devices::legacy::SerialDevice; use crate::devices::legacy::serial::SerialOut; -use crate::devices::legacy::{EventFdTrigger, SerialEventsWrapper, SerialWrapper}; use crate::devices::virtio::balloon::Balloon; use crate::devices::virtio::block::device::Block; use crate::devices::virtio::device::VirtioDevice; @@ -163,7 +160,7 @@ fn create_vmm_and_vcpus( set_stdout_nonblocking(); // Serial device setup. - let serial_device = setup_serial_device(event_manager, std::io::stdin(), io::stdout())?; + let serial_device = setup_serial_device(event_manager)?; // x86_64 uses the i8042 reset event as the Vmm exit event. 
let reset_evt = vcpus_exit_evt.try_clone().map_err(VmmError::EventFd)?; @@ -554,22 +551,11 @@ pub fn build_microvm_from_snapshot( /// Sets up the serial device. pub fn setup_serial_device( event_manager: &mut EventManager, - input: std::io::Stdin, - out: std::io::Stdout, ) -> Result>, VmmError> { - let interrupt_evt = EventFdTrigger::new(EventFd::new(EFD_NONBLOCK).map_err(VmmError::EventFd)?); - let kick_stdin_read_evt = - EventFdTrigger::new(EventFd::new(EFD_NONBLOCK).map_err(VmmError::EventFd)?); - let serial = Arc::new(Mutex::new(BusDevice::Serial(SerialWrapper { - serial: Serial::with_events( - interrupt_evt, - SerialEventsWrapper { - buffer_ready_event_fd: Some(kick_stdin_read_evt), - }, - SerialOut::Stdout(out), - ), - input: Some(input), - }))); + let serial = Arc::new(Mutex::new(BusDevice::Serial( + SerialDevice::new(Some(std::io::stdin()), SerialOut::Stdout(std::io::stdout())) + .map_err(VmmError::EventFd)?, + ))); event_manager.add_subscriber(serial.clone()); Ok(serial) } @@ -629,7 +615,7 @@ fn attach_legacy_devices_aarch64( if cmdline_contains_console { // Make stdout non-blocking. 
set_stdout_nonblocking(); - let serial = setup_serial_device(event_manager, std::io::stdin(), std::io::stdout())?; + let serial = setup_serial_device(event_manager)?; vmm.mmio_device_manager .register_mmio_serial(vmm.vm.fd(), &mut vmm.resource_allocator, serial, None) .map_err(VmmError::RegisterMMIODevice)?; @@ -809,11 +795,15 @@ pub(crate) fn set_stdout_nonblocking() { pub(crate) mod tests { use linux_loader::cmdline::Cmdline; + #[cfg(target_arch = "x86_64")] + use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::tempfile::TempFile; use super::*; use crate::arch::DeviceType; use crate::device_manager::resources::ResourceAllocator; + #[cfg(target_arch = "x86_64")] + use crate::devices::legacy::serial::SerialOut; use crate::devices::virtio::block::CacheType; use crate::devices::virtio::rng::device::ENTROPY_DEV_ID; use crate::devices::virtio::vsock::{TYPE_VSOCK, VSOCK_DEV_ID}; @@ -890,16 +880,9 @@ pub(crate) mod tests { let acpi_device_manager = ACPIDeviceManager::new(); #[cfg(target_arch = "x86_64")] let pio_device_manager = PortIODeviceManager::new( - Arc::new(Mutex::new(BusDevice::Serial(SerialWrapper { - serial: Serial::with_events( - EventFdTrigger::new(EventFd::new(EFD_NONBLOCK).unwrap()), - SerialEventsWrapper { - buffer_ready_event_fd: None, - }, - SerialOut::Sink(std::io::sink()), - ), - input: None, - }))), + Arc::new(Mutex::new(BusDevice::Serial( + SerialDevice::new(None, SerialOut::Sink(std::io::sink())).unwrap(), + ))), EventFd::new(libc::EFD_NONBLOCK).unwrap(), ) .unwrap(); diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 2f331e644ad..34f0bbc5530 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -421,11 +421,8 @@ impl<'a> Persist<'a> for MMIODeviceManager { { for state in &state.legacy_devices { if state.type_ == DeviceType::Serial { - let serial = crate::builder::setup_serial_device( - constructor_args.event_manager, - std::io::stdin(), - std::io::stdout(), 
- )?; + let serial = + crate::builder::setup_serial_device(constructor_args.event_manager)?; constructor_args .resource_allocator diff --git a/src/vmm/src/devices/bus.rs b/src/vmm/src/devices/bus.rs index d0e1b296998..516b40cc93f 100644 --- a/src/vmm/src/devices/bus.rs +++ b/src/vmm/src/devices/bus.rs @@ -65,7 +65,7 @@ pub enum BusDevice { RTCDevice(RTCDevice), BootTimer(BootTimer), MmioTransport(MmioTransport), - Serial(SerialDevice), + Serial(SerialDevice), #[cfg(test)] Dummy(DummyDevice), #[cfg(test)] @@ -127,7 +127,7 @@ impl BusDevice { _ => None, } } - pub fn serial_ref(&self) -> Option<&SerialDevice> { + pub fn serial_ref(&self) -> Option<&SerialDevice> { match self { Self::Serial(x) => Some(x), _ => None, @@ -159,7 +159,7 @@ impl BusDevice { _ => None, } } - pub fn serial_mut(&mut self) -> Option<&mut SerialDevice> { + pub fn serial_mut(&mut self) -> Option<&mut SerialDevice> { match self { Self::Serial(x) => Some(x), _ => None, diff --git a/src/vmm/src/devices/legacy/serial.rs b/src/vmm/src/devices/legacy/serial.rs index 278c15a4464..c73534e76c4 100644 --- a/src/vmm/src/devices/legacy/serial.rs +++ b/src/vmm/src/devices/legacy/serial.rs @@ -7,16 +7,17 @@ //! Implements a wrapper over an UART serial device. 
use std::fmt::Debug; -use std::io; -use std::io::{Read, Write}; +use std::io::{self, Read, Stdin, Write}; use std::os::unix::io::{AsRawFd, RawFd}; use event_manager::{EventOps, Events, MutEventSubscriber}; +use libc::EFD_NONBLOCK; use log::{error, warn}; use serde::Serialize; use vm_superio::serial::{Error as SerialError, SerialEvents}; use vm_superio::{Serial, Trigger}; use vmm_sys_util::epoll::EventSet; +use vmm_sys_util::eventfd::EventFd; use crate::devices::legacy::EventFdTrigger; use crate::logger::{IncMetric, SharedIncMetric}; @@ -220,7 +221,27 @@ impl SerialWrapper = SerialWrapper; +pub type SerialDevice = SerialWrapper; + +impl SerialDevice { + pub fn new(serial_in: Option, serial_out: SerialOut) -> Result { + let interrupt_evt = EventFdTrigger::new(EventFd::new(EFD_NONBLOCK)?); + let buffer_read_event_fd = EventFdTrigger::new(EventFd::new(EFD_NONBLOCK)?); + + let serial = Serial::with_events( + interrupt_evt, + SerialEventsWrapper { + buffer_ready_event_fd: Some(buffer_read_event_fd), + }, + serial_out, + ); + + Ok(SerialDevice { + serial, + input: serial_in, + }) + } +} impl MutEventSubscriber for SerialWrapper From 536588d5f05a5859bf5c14dd5965e4f7ea3714c0 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Thu, 24 Apr 2025 15:24:47 +0200 Subject: [PATCH 08/56] refactor: use vm_device::Bus as the MMIO bus Use the vm_device::Bus bus for all MMIO devices. This is mainly to prepare for using it for PCIe devices. Also, separate VirtIO devices from other MMIO devices inside the MMIODeviceManager struct. This makes iterating over VirtIO devices easier since we don't need to access two data structures to get a reference to a VirtIO device any more. 
Signed-off-by: Babis Chalios --- src/vmm/src/arch/aarch64/fdt.rs | 96 ++--- src/vmm/src/arch/aarch64/mod.rs | 4 +- src/vmm/src/arch/aarch64/vcpu.rs | 3 +- src/vmm/src/arch/x86_64/vcpu.rs | 3 +- src/vmm/src/builder.rs | 37 +- src/vmm/src/device_manager/mmio.rs | 415 ++++++++++--------- src/vmm/src/device_manager/persist.rs | 90 ++-- src/vmm/src/devices/bus.rs | 68 +-- src/vmm/src/devices/legacy/rtc_pl031.rs | 19 +- src/vmm/src/devices/legacy/serial.rs | 19 + src/vmm/src/devices/pseudo/boot_timer.rs | 15 +- src/vmm/src/devices/virtio/transport/mmio.rs | 128 +++--- src/vmm/src/lib.rs | 74 +--- src/vmm/src/persist.rs | 7 +- src/vmm/src/vstate/vcpu.rs | 17 +- 15 files changed, 493 insertions(+), 502 deletions(-) diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index 61200cb2148..54c8a30225c 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -5,14 +5,12 @@ // Use of this source code is governed by a BSD-style license that can be // found in the THIRD-PARTY file. -use std::collections::HashMap; use std::ffi::CString; use std::fmt::Debug; use vm_fdt::{Error as VmFdtError, FdtWriter, FdtWriterNode}; use vm_memory::GuestMemoryError; -use super::super::DeviceType; use super::cache_info::{CacheEntry, read_cache_config}; use super::gic::GICDevice; use crate::device_manager::mmio::MMIODeviceInfo; @@ -55,12 +53,15 @@ pub enum FdtError { WriteFdtToMemory(#[from] GuestMemoryError), } +#[allow(clippy::too_many_arguments)] /// Creates the flattened device tree for this aarch64 microVM. 
pub fn create_fdt( guest_mem: &GuestMemoryMmap, vcpu_mpidr: Vec, cmdline: CString, - device_info: &HashMap<(DeviceType, String), MMIODeviceInfo>, + virtio_devices: Vec<&MMIODeviceInfo>, + rtc: Option<&MMIODeviceInfo>, + serial: Option<&MMIODeviceInfo>, gic_device: &GICDevice, vmgenid: &Option, initrd: &Option, @@ -89,7 +90,7 @@ pub fn create_fdt( create_timer_node(&mut fdt_writer)?; create_clock_node(&mut fdt_writer)?; create_psci_node(&mut fdt_writer)?; - create_devices_node(&mut fdt_writer, device_info)?; + create_devices_node(&mut fdt_writer, virtio_devices, rtc, serial)?; create_vmgenid_node(&mut fdt_writer, vmgenid)?; // End Header node. @@ -411,25 +412,21 @@ fn create_rtc_node(fdt: &mut FdtWriter, dev_info: &MMIODeviceInfo) -> Result<(), fn create_devices_node( fdt: &mut FdtWriter, - dev_info: &HashMap<(DeviceType, String), MMIODeviceInfo>, + mut virtio_devices: Vec<&MMIODeviceInfo>, + rtc: Option<&MMIODeviceInfo>, + serial: Option<&MMIODeviceInfo>, ) -> Result<(), FdtError> { - // Create one temp Vec to store all virtio devices - let mut ordered_virtio_device: Vec<&MMIODeviceInfo> = Vec::new(); - - for ((device_type, _device_id), info) in dev_info { - match device_type { - DeviceType::BootTimer => (), // since it's not a real device - DeviceType::Rtc => create_rtc_node(fdt, info)?, - DeviceType::Serial => create_serial_node(fdt, info)?, - DeviceType::Virtio(_) => { - ordered_virtio_device.push(info); - } - } + if let Some(device_info) = rtc { + create_rtc_node(fdt, device_info)?; + } + + if let Some(device_info) = serial { + create_serial_node(fdt, device_info)?; } // Sort out virtio devices by address from low to high and insert them into fdt table. - ordered_virtio_device.sort_by_key(|a| a.addr); - for ordered_device_info in ordered_virtio_device.drain(..) { + virtio_devices.sort_by_key(|a| a.addr); + for ordered_device_info in virtio_devices.drain(..) 
{ create_virtio_node(fdt, ordered_device_info)?; } @@ -465,35 +462,22 @@ mod tests { fn test_create_fdt_with_devices() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); - let dev_info: HashMap<(DeviceType, std::string::String), MMIODeviceInfo> = [ - ( - (DeviceType::Serial, DeviceType::Serial.to_string()), - MMIODeviceInfo { - addr: 0x00, - irq: NonZeroU32::new(1), - len: LEN, - }, - ), - ( - (DeviceType::Virtio(1), "virtio".to_string()), - MMIODeviceInfo { - addr: LEN, - irq: NonZeroU32::new(2), - len: LEN, - }, - ), - ( - (DeviceType::Rtc, "rtc".to_string()), - MMIODeviceInfo { - addr: 2 * LEN, - irq: NonZeroU32::new(3), - len: LEN, - }, - ), - ] - .iter() - .cloned() - .collect(); + let serial = MMIODeviceInfo { + addr: 0x00, + irq: NonZeroU32::new(1), + len: LEN, + }; + let virtio_device = MMIODeviceInfo { + addr: LEN, + irq: NonZeroU32::new(2), + len: LEN, + }; + let rtc = MMIODeviceInfo { + addr: 2 * LEN, + irq: NonZeroU32::new(3), + len: LEN, + }; + let kvm = Kvm::new().unwrap(); let vm = kvm.create_vm().unwrap(); let gic = create_gic(&vm, 1, None).unwrap(); @@ -501,7 +485,9 @@ mod tests { &mem, vec![0], CString::new("console=tty0").unwrap(), - &dev_info, + vec![&virtio_device], + Some(&rtc), + Some(&serial), &gic, &None, &None, @@ -521,7 +507,9 @@ mod tests { &mem, vec![0], CString::new("console=tty0").unwrap(), - &HashMap::<(DeviceType, std::string::String), MMIODeviceInfo>::new(), + Vec::new(), + None, + None, &gic, &Some(vmgenid), &None, @@ -546,7 +534,9 @@ mod tests { &mem, vec![0], CString::new("console=tty0").unwrap(), - &HashMap::<(DeviceType, std::string::String), MMIODeviceInfo>::new(), + Vec::new(), + None, + None, &gic, &None, &None, @@ -608,7 +598,9 @@ mod tests { &mem, vec![0], CString::new("console=tty0").unwrap(), - &HashMap::<(DeviceType, std::string::String), MMIODeviceInfo>::new(), + vec![], + None, + None, &gic, &None, &Some(initrd), diff --git a/src/vmm/src/arch/aarch64/mod.rs b/src/vmm/src/arch/aarch64/mod.rs index 
ead827c08c4..f945601c940 100644 --- a/src/vmm/src/arch/aarch64/mod.rs +++ b/src/vmm/src/arch/aarch64/mod.rs @@ -134,7 +134,9 @@ pub fn configure_system_for_boot( vmm.vm.guest_memory(), vcpu_mpidr, cmdline, - vmm.mmio_device_manager.get_device_info(), + vmm.mmio_device_manager.virtio_device_info(), + vmm.mmio_device_manager.rtc_device_info(), + vmm.mmio_device_manager.serial_device_info(), vmm.vm.get_irqchip(), &vmm.acpi_device_manager.vmgenid, initrd, diff --git a/src/vmm/src/arch/aarch64/vcpu.rs b/src/vmm/src/arch/aarch64/vcpu.rs index 59c00c3ff86..005beb47ec4 100644 --- a/src/vmm/src/arch/aarch64/vcpu.rs +++ b/src/vmm/src/arch/aarch64/vcpu.rs @@ -7,6 +7,7 @@ use std::fmt::{Debug, Write}; use std::mem::offset_of; +use std::sync::Arc; use kvm_bindings::*; use kvm_ioctls::{VcpuExit, VcpuFd, VmFd}; @@ -126,7 +127,7 @@ pub struct KvmVcpu { #[derive(Default, Debug)] pub struct Peripherals { /// mmio bus. - pub mmio_bus: Option, + pub mmio_bus: Option>, } impl KvmVcpu { diff --git a/src/vmm/src/arch/x86_64/vcpu.rs b/src/vmm/src/arch/x86_64/vcpu.rs index b46d8e07b59..4eb609aadd6 100644 --- a/src/vmm/src/arch/x86_64/vcpu.rs +++ b/src/vmm/src/arch/x86_64/vcpu.rs @@ -7,6 +7,7 @@ use std::collections::BTreeMap; use std::fmt::Debug; +use std::sync::Arc; use kvm_bindings::{ CpuId, KVM_MAX_CPUID_ENTRIES, KVM_MAX_MSR_ENTRIES, Msrs, Xsave, kvm_debugregs, kvm_lapic_state, @@ -161,7 +162,7 @@ pub struct Peripherals { /// Pio bus. pub pio_bus: Option, /// Mmio bus. - pub mmio_bus: Option, + pub mmio_bus: Option>, } impl KvmVcpu { diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index db4f11b7a26..ac527b16747 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -615,7 +615,11 @@ fn attach_legacy_devices_aarch64( if cmdline_contains_console { // Make stdout non-blocking. 
set_stdout_nonblocking(); - let serial = setup_serial_device(event_manager)?; + let serial = Arc::new(Mutex::new( + SerialDevice::new(Some(std::io::stdin()), SerialOut::Stdout(std::io::stdout())) + .map_err(VmmError::EventFd)?, + )); + event_manager.add_subscriber(serial.clone()); vmm.mmio_device_manager .register_mmio_serial(vmm.vm.fd(), &mut vmm.resource_allocator, serial, None) .map_err(VmmError::RegisterMMIODevice)?; @@ -800,7 +804,6 @@ pub(crate) mod tests { use vmm_sys_util::tempfile::TempFile; use super::*; - use crate::arch::DeviceType; use crate::device_manager::resources::ResourceAllocator; #[cfg(target_arch = "x86_64")] use crate::devices::legacy::serial::SerialOut; @@ -999,7 +1002,7 @@ pub(crate) mod tests { assert!( vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_VSOCK), &vsock_dev_id) + .get_virtio_device(TYPE_VSOCK, &vsock_dev_id) .is_some() ); } @@ -1017,7 +1020,7 @@ pub(crate) mod tests { assert!( vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_RNG), ENTROPY_DEV_ID) + .get_virtio_device(TYPE_RNG, ENTROPY_DEV_ID) .is_some() ); } @@ -1042,7 +1045,7 @@ pub(crate) mod tests { assert!( vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BALLOON), BALLOON_DEV_ID) + .get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) .is_some() ); } @@ -1093,7 +1096,7 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=/dev/vda ro")); assert!( vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BLOCK), drive_id.as_str()) + .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); } @@ -1114,7 +1117,7 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=PARTUUID=0eaa91a0-01 rw")); assert!( vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BLOCK), drive_id.as_str()) + .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); } @@ -1136,7 +1139,7 @@ pub(crate) mod tests { assert!(!cmdline_contains(&cmdline, "root=/dev/vda")); assert!( vmm.mmio_device_manager - 
.get_device(DeviceType::Virtio(TYPE_BLOCK), drive_id.as_str()) + .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); } @@ -1173,17 +1176,17 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=PARTUUID=0eaa91a0-01 rw")); assert!( vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BLOCK), "root") + .get_virtio_device(TYPE_BLOCK, "root") .is_some() ); assert!( vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BLOCK), "secondary") + .get_virtio_device(TYPE_BLOCK, "secondary") .is_some() ); assert!( vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BLOCK), "third") + .get_virtio_device(TYPE_BLOCK, "third") .is_some() ); @@ -1212,7 +1215,7 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=/dev/vda rw")); assert!( vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BLOCK), drive_id.as_str()) + .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); } @@ -1233,7 +1236,7 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=PARTUUID=0eaa91a0-01 ro")); assert!( vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BLOCK), drive_id.as_str()) + .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); } @@ -1254,7 +1257,7 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=/dev/vda rw")); assert!( vmm.mmio_device_manager - .get_device(DeviceType::Virtio(TYPE_BLOCK), drive_id.as_str()) + .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); } @@ -1267,11 +1270,7 @@ pub(crate) mod tests { let res = attach_boot_timer_device(&mut vmm, request_ts); res.unwrap(); - assert!( - vmm.mmio_device_manager - .get_device(DeviceType::BootTimer, &DeviceType::BootTimer.to_string()) - .is_some() - ); + assert!(vmm.mmio_device_manager.boot_timer.is_some()); } #[test] diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 2c7bdb0d679..add6e200954 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ 
b/src/vmm/src/device_manager/mmio.rs @@ -21,11 +21,8 @@ use serde::{Deserialize, Serialize}; use vm_allocator::AllocPolicy; use super::resources::ResourceAllocator; -use crate::arch::DeviceType; -use crate::arch::DeviceType::Virtio; -use crate::devices::BusDevice; #[cfg(target_arch = "aarch64")] -use crate::devices::legacy::RTCDevice; +use crate::devices::legacy::{RTCDevice, SerialDevice}; use crate::devices::pseudo::BootTimer; use crate::devices::virtio::balloon::Balloon; use crate::devices::virtio::block::device::Block; @@ -44,9 +41,9 @@ pub enum MmioError { /// Failed to allocate requested resource: {0} Allocator(#[from] vm_allocator::Error), /// Failed to insert device on the bus: {0} - BusInsert(crate::devices::BusError), + BusInsert(#[from] vm_device::BusError), /// Failed to allocate requested resourc: {0} - Cmdline(linux_loader::cmdline::Error), + Cmdline(#[from] linux_loader::cmdline::Error), /// Failed to find the device on the bus. DeviceNotFound, /// Invalid device type found on the MMIO bus. @@ -74,7 +71,7 @@ pub enum MmioError { pub const MMIO_LEN: u64 = 0x1000; /// Stores the address range and irq allocated to this device. -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct MMIODeviceInfo { /// Mmio address at which the device is registered. pub addr: u64, @@ -118,11 +115,30 @@ fn add_virtio_aml( .append_aml_bytes(dsdt_data) } +#[derive(Debug, Clone)] +/// A descriptor for MMIO devices +pub struct MMIODevice { + /// MMIO resources allocated to the device + pub(crate) resources: MMIODeviceInfo, + /// The actual device + pub(crate) inner: Arc>, +} + /// Manages the complexities of registering a MMIO device. 
#[derive(Debug)] pub struct MMIODeviceManager { - pub(crate) bus: crate::devices::Bus, - pub(crate) id_to_dev_info: HashMap<(DeviceType, String), MMIODeviceInfo>, + pub(crate) bus: Arc, + /// VirtIO devices using an MMIO transport layer + pub(crate) virtio_devices: HashMap<(u32, String), MMIODevice>, + /// Boot timer device + pub(crate) boot_timer: Option>, + #[cfg(target_arch = "aarch64")] + /// Real-Time clock on Aarch64 platforms + pub(crate) rtc: Option>, + #[cfg(target_arch = "aarch64")] + /// Serial device on Aarch64 platforms + pub(crate) serial: Option>, + #[cfg(target_arch = "x86_64")] // We create the AML byte code for every VirtIO device in the order we build // it, so that we ensure the root block device is appears first in the DSDT. // This is needed, so that the root device appears as `/dev/vda` in the guest @@ -130,7 +146,6 @@ pub struct MMIODeviceManager { // The alternative would be that we iterate the bus to get the data after all // of the devices are build. However, iterating the bus won't give us the // devices in the order they were added. - #[cfg(target_arch = "x86_64")] pub(crate) dsdt_data: Vec, } @@ -138,8 +153,13 @@ impl MMIODeviceManager { /// Create a new DeviceManager handling mmio devices (virtio net, block). pub fn new() -> MMIODeviceManager { MMIODeviceManager { - bus: crate::devices::Bus::new(), - id_to_dev_info: HashMap::new(), + bus: Arc::new(vm_device::Bus::new()), + virtio_devices: HashMap::new(), + boot_timer: None, + #[cfg(target_arch = "aarch64")] + rtc: None, + #[cfg(target_arch = "aarch64")] + serial: None, #[cfg(target_arch = "x86_64")] dsdt_data: vec![], } @@ -169,20 +189,6 @@ impl MMIODeviceManager { Ok(device_info) } - /// Register a device at some MMIO address. 
- fn register_mmio_device( - &mut self, - identifier: (DeviceType, String), - device_info: MMIODeviceInfo, - device: Arc>, - ) -> Result<(), MmioError> { - self.bus - .insert(device, device_info.addr, device_info.len) - .map_err(MmioError::BusInsert)?; - self.id_to_dev_info.insert(identifier, device_info); - Ok(()) - } - /// Register a virtio-over-MMIO device to be used via MMIO transport at a specific slot. pub fn register_mmio_virtio( &mut self, @@ -199,7 +205,7 @@ impl MMIODeviceManager { let identifier; { let locked_device = mmio_device.locked_device(); - identifier = (DeviceType::Virtio(locked_device.device_type()), device_id); + identifier = (locked_device.device_type(), device_id); for (i, queue_evt) in locked_device.queue_events().iter().enumerate() { let io_addr = IoEventAddress::Mmio( device_info.addr + u64::from(crate::devices::virtio::NOTIFY_REG_OFFSET), @@ -211,11 +217,18 @@ impl MMIODeviceManager { .map_err(MmioError::RegisterIrqFd)?; } - self.register_mmio_device( + let device = Arc::new(Mutex::new(mmio_device)); + self.bus + .insert(device.clone(), device_info.addr, device_info.len)?; + self.virtio_devices.insert( identifier, - device_info.clone(), - Arc::new(Mutex::new(BusDevice::MmioTransport(mmio_device))), - ) + MMIODevice { + resources: *device_info, + inner: device, + }, + ); + + Ok(()) } /// Append a registered virtio-over-MMIO device to the kernel cmdline. 
@@ -273,7 +286,7 @@ impl MMIODeviceManager { &mut self, vm: &VmFd, resource_allocator: &mut ResourceAllocator, - serial: Arc>, + serial: Arc>, device_info_opt: Option, ) -> Result<(), MmioError> { // Create a new MMIODeviceInfo object on boot path or unwrap the @@ -285,20 +298,18 @@ impl MMIODeviceManager { }; vm.register_irqfd( - serial - .lock() - .expect("Poisoned lock") - .serial_ref() - .unwrap() - .serial - .interrupt_evt(), + serial.lock().expect("Poisoned lock").serial.interrupt_evt(), device_info.irq.unwrap().get(), ) .map_err(MmioError::RegisterIrqFd)?; - let identifier = (DeviceType::Serial, DeviceType::Serial.to_string()); - // Register the newly created Serial object. - self.register_mmio_device(identifier, device_info, serial) + self.bus + .insert(serial.clone(), device_info.addr, device_info.len)?; + self.serial = Some(MMIODevice { + resources: device_info, + inner: serial, + }); + Ok(()) } #[cfg(target_arch = "aarch64")] @@ -307,13 +318,16 @@ impl MMIODeviceManager { &self, cmdline: &mut kernel_cmdline::Cmdline, ) -> Result<(), MmioError> { - let device_info = self - .id_to_dev_info - .get(&(DeviceType::Serial, DeviceType::Serial.to_string())) - .ok_or(MmioError::DeviceNotFound)?; - cmdline - .insert("earlycon", &format!("uart,mmio,0x{:08x}", device_info.addr)) - .map_err(MmioError::Cmdline) + match &self.serial { + Some(device) => { + cmdline.insert( + "earlycon", + &format!("uart,mmio,0x{:08x}", device.resources.addr), + )?; + Ok(()) + } + None => Err(MmioError::DeviceNotFound), + } } #[cfg(target_arch = "aarch64")] @@ -325,6 +339,7 @@ impl MMIODeviceManager { rtc: RTCDevice, device_info_opt: Option, ) -> Result<(), MmioError> { + let device = Arc::new(Mutex::new(rtc)); // Create a new MMIODeviceInfo object on boot path or unwrap the // existing object on restore path. let device_info = if let Some(device_info) = device_info_opt { @@ -333,88 +348,53 @@ impl MMIODeviceManager { self.allocate_mmio_resources(resource_allocator, 1)? 
}; - // Create a new identifier for the RTC device. - let identifier = (DeviceType::Rtc, DeviceType::Rtc.to_string()); - // Attach the newly created RTC device. - self.register_mmio_device( - identifier, - device_info, - Arc::new(Mutex::new(BusDevice::RTCDevice(rtc))), - ) + self.bus + .insert(device.clone(), device_info.addr, device_info.len)?; + self.rtc = Some(MMIODevice { + resources: device_info, + inner: device, + }); + + Ok(()) } /// Register a boot timer device. pub fn register_mmio_boot_timer( &mut self, resource_allocator: &mut ResourceAllocator, - device: BootTimer, + boot_timer: BootTimer, ) -> Result<(), MmioError> { // Attach a new boot timer device. let device_info = self.allocate_mmio_resources(resource_allocator, 0)?; - let identifier = (DeviceType::BootTimer, DeviceType::BootTimer.to_string()); - self.register_mmio_device( - identifier, - device_info, - Arc::new(Mutex::new(BusDevice::BootTimer(device))), - ) - } - - /// Gets the information of the devices registered up to some point in time. - pub fn get_device_info(&self) -> &HashMap<(DeviceType, String), MMIODeviceInfo> { - &self.id_to_dev_info + let device = Arc::new(Mutex::new(boot_timer)); + self.bus + .insert(device.clone(), device_info.addr, device_info.len)?; + self.boot_timer = Some(MMIODevice { + resources: device_info, + inner: device, + }); + Ok(()) } /// Gets the specified device. - pub fn get_device( + pub fn get_virtio_device( &self, - device_type: DeviceType, + virtio_type: u32, device_id: &str, - ) -> Option<&Mutex> { - if let Some(device_info) = self - .id_to_dev_info - .get(&(device_type, device_id.to_string())) - { - if let Some((_, device)) = self.bus.get_device(device_info.addr) { - return Some(device); - } - } - None - } - - /// Run fn for each registered device. 
- pub fn for_each_device(&self, mut f: F) -> Result<(), E> - where - F: FnMut(&DeviceType, &String, &MMIODeviceInfo, &Mutex) -> Result<(), E>, - { - for ((device_type, device_id), device_info) in self.get_device_info().iter() { - let bus_device = self - .get_device(*device_type, device_id) - // Safe to unwrap() because we know the device exists. - .unwrap(); - f(device_type, device_id, device_info, bus_device)?; - } - Ok(()) + ) -> Option<&MMIODevice> { + self.virtio_devices + .get(&(virtio_type, device_id.to_string())) } /// Run fn for each registered virtio device. pub fn for_each_virtio_device(&self, mut f: F) -> Result<(), E> where - F: FnMut(u32, &String, &MMIODeviceInfo, Arc>) -> Result<(), E>, + F: FnMut(&u32, &String, &MMIODevice) -> Result<(), E>, { - self.for_each_device(|device_type, device_id, device_info, bus_device| { - if let Virtio(virtio_type) = device_type { - let virtio_device = bus_device - .lock() - .expect("Poisoned lock") - .mmio_transport_ref() - .expect("Unexpected device type") - .device(); - f(*virtio_type, device_id, device_info, virtio_device)?; - } - Ok(()) - })?; - + for ((virtio_type, device_id), mmio_device) in &self.virtio_devices { + f(virtio_type, device_id, mmio_device)?; + } Ok(()) } @@ -429,13 +409,8 @@ impl MMIODeviceManager { T: VirtioDevice + 'static + Debug, F: FnOnce(&mut T) -> Result<(), String>, { - if let Some(busdev) = self.get_device(DeviceType::Virtio(virtio_type), id) { - let virtio_device = busdev - .lock() - .expect("Poisoned lock") - .mmio_transport_ref() - .expect("Unexpected device type") - .device(); + if let Some(device) = self.get_virtio_device(virtio_type, id) { + let virtio_device = device.inner.lock().expect("Poisoned lock").device(); let mut dev = virtio_device.lock().expect("Poisoned lock"); f(dev .as_mut_any() @@ -452,73 +427,92 @@ impl MMIODeviceManager { pub fn kick_devices(&self) { info!("Artificially kick devices."); // We only kick virtio devices for now. 
- let _: Result<(), MmioError> = - self.for_each_virtio_device(|virtio_type, id, _info, dev| { - let mut virtio = dev.lock().expect("Poisoned lock"); - match virtio_type { - TYPE_BALLOON => { - let balloon = virtio.as_mut_any().downcast_mut::().unwrap(); - // If device is activated, kick the balloon queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in snapshot. - // Stats queue doesn't need kicking as it is notified via a `timer_fd`. - if balloon.is_activated() { - info!("kick balloon {}.", id); - balloon.process_virtio_queues().unwrap(); - } - } - TYPE_BLOCK => { - // We only care about kicking virtio block. - // If we need to kick vhost-user-block we can do nothing. - if let Some(block) = virtio.as_mut_any().downcast_mut::() { - // If device is activated, kick the block queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in - // snapshot. No need to kick Ratelimiters - // because they are restored 'unblocked' so - // any inflight `timer_fd` events can be safely discarded. - if block.is_activated() { - info!("kick block {}.", id); - block.process_virtio_queues().unwrap() - } - } + let _: Result<(), MmioError> = self.for_each_virtio_device(|virtio_type, id, device| { + let mmio_transport_locked = device.inner.lock().expect("Poisoned locked"); + let mut virtio = mmio_transport_locked.locked_device(); + match *virtio_type { + TYPE_BALLOON => { + let balloon = virtio.as_mut_any().downcast_mut::().unwrap(); + // If device is activated, kick the balloon queue(s) to make up for any + // pending or in-flight epoll events we may have not captured in snapshot. + // Stats queue doesn't need kicking as it is notified via a `timer_fd`. 
+ if balloon.is_activated() { + info!("kick balloon {}.", id); + balloon.process_virtio_queues(); } - TYPE_NET => { - let net = virtio.as_mut_any().downcast_mut::().unwrap(); - // If device is activated, kick the net queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in snapshot. - // No need to kick Ratelimiters because they are restored 'unblocked' so + } + TYPE_BLOCK => { + // We only care about kicking virtio block. + // If we need to kick vhost-user-block we can do nothing. + if let Some(block) = virtio.as_mut_any().downcast_mut::() { + // If device is activated, kick the block queue(s) to make up for any + // pending or in-flight epoll events we may have not captured in + // snapshot. No need to kick Ratelimiters + // because they are restored 'unblocked' so // any inflight `timer_fd` events can be safely discarded. - if net.is_activated() { - info!("kick net {}.", id); - net.process_virtio_queues().unwrap(); + if block.is_activated() { + info!("kick block {}.", id); + block.process_virtio_queues().unwrap(); } } - TYPE_VSOCK => { - // Vsock has complicated protocol that isn't resilient to any packet loss, - // so for Vsock we don't support connection persistence through snapshot. - // Any in-flight packets or events are simply lost. - // Vsock is restored 'empty'. - // The only reason we still `kick` it is to make guest process - // `TRANSPORT_RESET_EVENT` event we sent during snapshot creation. - let vsock = virtio - .as_mut_any() - .downcast_mut::>() - .unwrap(); - if vsock.is_activated() { - info!("kick vsock {id}."); - vsock.signal_used_queue(0).unwrap(); - } + } + TYPE_NET => { + let net = virtio.as_mut_any().downcast_mut::().unwrap(); + // If device is activated, kick the net queue(s) to make up for any + // pending or in-flight epoll events we may have not captured in snapshot. + // No need to kick Ratelimiters because they are restored 'unblocked' so + // any inflight `timer_fd` events can be safely discarded. 
+ if net.is_activated() { + info!("kick net {}.", id); + net.process_virtio_queues().unwrap(); } - TYPE_RNG => { - let entropy = virtio.as_mut_any().downcast_mut::().unwrap(); - if entropy.is_activated() { - info!("kick entropy {id}."); - entropy.process_virtio_queues().unwrap(); - } + } + TYPE_VSOCK => { + // Vsock has complicated protocol that isn't resilient to any packet loss, + // so for Vsock we don't support connection persistence through snapshot. + // Any in-flight packets or events are simply lost. + // Vsock is restored 'empty'. + // The only reason we still `kick` it is to make guest process + // `TRANSPORT_RESET_EVENT` event we sent during snapshot creation. + let vsock = virtio + .as_mut_any() + .downcast_mut::>() + .unwrap(); + if vsock.is_activated() { + info!("kick vsock {id}."); + vsock.signal_used_queue(0).unwrap(); } - _ => (), } - Ok(()) - }); + TYPE_RNG => { + let entropy = virtio.as_mut_any().downcast_mut::().unwrap(); + if entropy.is_activated() { + info!("kick entropy {id}."); + entropy.process_virtio_queues().unwrap(); + } + } + _ => (), + } + Ok(()) + }); + } + + #[cfg(target_arch = "aarch64")] + pub fn virtio_device_info(&self) -> Vec<&MMIODeviceInfo> { + let mut device_info = Vec::new(); + for (_, dev) in self.virtio_devices.iter() { + device_info.push(&dev.resources); + } + device_info + } + + #[cfg(target_arch = "aarch64")] + pub fn rtc_device_info(&self) -> Option<&MMIODeviceInfo> { + self.rtc.as_ref().map(|device| &device.resources) + } + + #[cfg(target_arch = "aarch64")] + pub fn serial_device_info(&self) -> Option<&MMIODeviceInfo> { + self.serial.as_ref().map(|device| &device.resources) } } @@ -531,7 +525,6 @@ mod tests { use vmm_sys_util::eventfd::EventFd; use super::*; - use crate::Vm; use crate::devices::virtio::ActivateError; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::Queue; @@ -540,6 +533,7 @@ mod tests { use crate::test_utils::multi_region_mem_raw; use crate::vstate::kvm::Kvm; use 
crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; + use crate::{Vm, arch}; const QUEUE_SIZES: &[u16] = &[64]; @@ -568,9 +562,9 @@ mod tests { #[cfg(target_arch = "x86_64")] /// Gets the number of interrupts used by the devices registered. pub fn used_irqs_count(&self) -> usize { - self.get_device_info() + self.virtio_devices .iter() - .filter(|(_, device_info)| device_info.irq.is_some()) + .filter(|(_, mmio_dev)| mmio_dev.resources.irq.is_some()) .count() } } @@ -683,6 +677,29 @@ mod tests { "dummy", ) .unwrap(); + + assert!(device_manager.get_virtio_device(0, "foo").is_none()); + let dev = device_manager.get_virtio_device(0, "dummy").unwrap(); + assert_eq!(dev.resources.addr, arch::MMIO_MEM_START); + assert_eq!(dev.resources.len, MMIO_LEN); + assert_eq!( + dev.resources.irq, + Some(NonZeroU32::try_from(arch::IRQ_BASE).unwrap()) + ); + + device_manager + .for_each_virtio_device(|virtio_type, device_id, mmio_device| { + assert_eq!(*virtio_type, 0); + assert_eq!(device_id, "dummy"); + assert_eq!(mmio_device.resources.addr, arch::MMIO_MEM_START); + assert_eq!(mmio_device.resources.len, MMIO_LEN); + assert_eq!( + mmio_device.resources.irq, + Some(NonZeroU32::try_from(arch::IRQ_BASE).unwrap()) + ); + Ok::<(), ()>(()) + }) + .unwrap(); } #[test] @@ -773,29 +790,24 @@ mod tests { &id, ) .unwrap(); - assert!( - device_manager - .get_device(DeviceType::Virtio(type_id), &id) - .is_some() - ); + assert!(device_manager.get_virtio_device(type_id, &id).is_some()); assert_eq!( addr, - device_manager.id_to_dev_info[&(DeviceType::Virtio(type_id), id.clone())].addr + device_manager.virtio_devices[&(type_id, id.clone())] + .resources + .addr ); assert_eq!( crate::arch::IRQ_BASE, - device_manager.id_to_dev_info[&(DeviceType::Virtio(type_id), id)] + device_manager.virtio_devices[&(type_id, id)] + .resources .irq .unwrap() .get() ); let id = "bar"; - assert!( - device_manager - .get_device(DeviceType::Virtio(type_id), id) - .is_none() - ); + 
assert!(device_manager.get_virtio_device(type_id, id).is_none()); let dummy2 = Arc::new(Mutex::new(DummyDevice::new())); let id2 = String::from("foo2"); @@ -811,15 +823,16 @@ mod tests { .unwrap(); let mut count = 0; - let _: Result<(), MmioError> = device_manager.for_each_device(|devtype, devid, _, _| { - assert_eq!(*devtype, DeviceType::Virtio(type_id)); - match devid.as_str() { - "foo" => count += 1, - "foo2" => count += 2, - _ => unreachable!(), - }; - Ok(()) - }); + let _: Result<(), MmioError> = + device_manager.for_each_virtio_device(|devtype, devid, _| { + assert_eq!(*devtype, type_id); + match devid.as_str() { + "foo" => count += 1, + "foo2" => count += 2, + _ => unreachable!(), + }; + Ok(()) + }); assert_eq!(count, 3); #[cfg(target_arch = "x86_64")] assert_eq!(device_manager.used_irqs_count(), 2); diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 34f0bbc5530..9d06ce78350 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -19,6 +19,10 @@ use crate::EventManager; #[cfg(target_arch = "aarch64")] use crate::arch::DeviceType; use crate::devices::acpi::vmgenid::{VMGenIDState, VMGenIdConstructorArgs, VmGenId, VmGenIdError}; +#[cfg(target_arch = "aarch64")] +use crate::devices::legacy::serial::SerialOut; +#[cfg(target_arch = "aarch64")] +use crate::devices::legacy::{RTCDevice, SerialDevice}; use crate::devices::virtio::balloon::persist::{BalloonConstructorArgs, BalloonState}; use crate::devices::virtio::balloon::{Balloon, BalloonError}; use crate::devices::virtio::block::BlockError; @@ -61,7 +65,7 @@ pub enum DevicePersistError { MmioTransport, #[cfg(target_arch = "aarch64")] /// Legacy: {0} - Legacy(#[from] crate::VmmError), + Legacy(#[from] std::io::Error), /// Net: {0} Net(#[from] NetError), /// Vsock: {0} @@ -285,32 +289,29 @@ impl<'a> Persist<'a> for MMIODeviceManager { fn save(&self) -> Self::State { let mut states = DeviceStates::default(); - let _: Result<(), ()> 
= self.for_each_device(|devtype, devid, device_info, bus_dev| { - if *devtype == crate::arch::DeviceType::BootTimer { - // No need to save BootTimer state. - return Ok(()); - } - #[cfg(target_arch = "aarch64")] - { - if *devtype == DeviceType::Serial || *devtype == DeviceType::Rtc { - states.legacy_devices.push(ConnectedLegacyState { - type_: *devtype, - device_info: device_info.clone(), - }); - return Ok(()); - } + #[cfg(target_arch = "aarch64")] + { + if let Some(device) = &self.serial { + states.legacy_devices.push(ConnectedLegacyState { + type_: DeviceType::Serial, + device_info: device.resources, + }); } - let locked_bus_dev = bus_dev.lock().expect("Poisoned lock"); - - let mmio_transport = locked_bus_dev - .mmio_transport_ref() - .expect("Unexpected device type"); + if let Some(device) = &self.rtc { + states.legacy_devices.push(ConnectedLegacyState { + type_: DeviceType::Rtc, + device_info: device.resources, + }); + } + } - let transport_state = mmio_transport.save(); + let _: Result<(), ()> = self.for_each_virtio_device(|_, devid, device| { + let mmio_transport_locked = device.inner.lock().expect("Poisoned lock"); + let transport_state = mmio_transport_locked.save(); - let mut locked_device = mmio_transport.locked_device(); + let mut locked_device = mmio_transport_locked.locked_device(); match locked_device.device_type() { TYPE_BALLOON => { let balloon_state = locked_device @@ -322,7 +323,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { device_id: devid.clone(), device_state: balloon_state, transport_state, - device_info: device_info.clone(), + device_info: device.resources, }); } // Both virtio-block and vhost-user-block share same device type. 
@@ -339,7 +340,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { device_id: devid.clone(), device_state: block.save(), transport_state, - device_info: device_info.clone(), + device_info: device.resources, }) } } @@ -356,7 +357,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { device_id: devid.clone(), device_state: net.save(), transport_state, - device_info: device_info.clone(), + device_info: device.resources, }); } TYPE_VSOCK => { @@ -385,7 +386,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { device_id: devid.clone(), device_state: vsock_state, transport_state, - device_info: device_info.clone(), + device_info: device.resources, }); } TYPE_RNG => { @@ -398,7 +399,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { device_id: devid.clone(), device_state: entropy.save(), transport_state, - device_info: device_info.clone(), + device_info: device.resources, }); } _ => unreachable!(), @@ -421,8 +422,13 @@ impl<'a> Persist<'a> for MMIODeviceManager { { for state in &state.legacy_devices { if state.type_ == DeviceType::Serial { - let serial = - crate::builder::setup_serial_device(constructor_args.event_manager)?; + let serial = Arc::new(Mutex::new(SerialDevice::new( + Some(std::io::stdin()), + SerialOut::Stdout(std::io::stdout()), + )?)); + constructor_args + .event_manager + .add_subscriber(serial.clone()); constructor_args .resource_allocator @@ -439,11 +445,11 @@ impl<'a> Persist<'a> for MMIODeviceManager { vm, constructor_args.resource_allocator, serial, - Some(state.device_info.clone()), + Some(state.device_info), )?; } if state.type_ == DeviceType::Rtc { - let rtc = crate::devices::legacy::RTCDevice(vm_superio::Rtc::with_events( + let rtc = RTCDevice(vm_superio::Rtc::with_events( &crate::devices::legacy::rtc_pl031::METRICS, )); constructor_args @@ -459,7 +465,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { dev_manager.register_mmio_rtc( constructor_args.resource_allocator, rtc, - Some(state.device_info.clone()), + Some(state.device_info), )?; } } @@ -728,24 
+734,32 @@ mod tests { // know will results in `Ok` let mut clone = MMIODeviceManager::new(); // We only care about the device hashmap. - clone.id_to_dev_info.clone_from(&self.id_to_dev_info); + clone.virtio_devices.clone_from(&self.virtio_devices); + clone.boot_timer = self.boot_timer.clone(); clone } } + impl PartialEq for MMIODevice { + fn eq(&self, other: &Self) -> bool { + self.resources == other.resources + } + } + impl PartialEq for MMIODeviceManager { fn eq(&self, other: &MMIODeviceManager) -> bool { // We only care about the device hashmap. - if self.id_to_dev_info.len() != other.id_to_dev_info.len() { + if self.virtio_devices.len() != other.virtio_devices.len() { return false; } - for (key, val) in &self.id_to_dev_info { - match other.id_to_dev_info.get(key) { + for (key, val) in &self.virtio_devices { + match other.virtio_devices.get(key) { Some(other_val) if val == other_val => continue, _ => return false, - }; + } } - true + + self.boot_timer == other.boot_timer } } diff --git a/src/vmm/src/devices/bus.rs b/src/vmm/src/devices/bus.rs index 516b40cc93f..6f7e1531bf3 100644 --- a/src/vmm/src/devices/bus.rs +++ b/src/vmm/src/devices/bus.rs @@ -9,6 +9,8 @@ use std::cmp::{Ord, Ordering, PartialEq, PartialOrd}; use std::collections::btree_map::BTreeMap; +#[cfg(test)] +use std::sync::Barrier; use std::sync::{Arc, Mutex}; /// Errors triggered during bus operations. 
@@ -55,20 +57,14 @@ use event_manager::{EventOps, Events, MutEventSubscriber}; #[cfg(target_arch = "aarch64")] use super::legacy::RTCDevice; use super::legacy::{I8042Device, SerialDevice}; -use super::pseudo::BootTimer; -use super::virtio::transport::mmio::MmioTransport; #[derive(Debug)] pub enum BusDevice { I8042Device(I8042Device), #[cfg(target_arch = "aarch64")] RTCDevice(RTCDevice), - BootTimer(BootTimer), - MmioTransport(MmioTransport), Serial(SerialDevice), #[cfg(test)] - Dummy(DummyDevice), - #[cfg(test)] Constant(ConstantDevice), } @@ -77,9 +73,11 @@ pub enum BusDevice { pub struct DummyDevice; #[cfg(test)] -impl DummyDevice { - pub fn bus_write(&mut self, _offset: u64, _data: &[u8]) {} - pub fn bus_read(&mut self, _offset: u64, _data: &[u8]) {} +impl vm_device::BusDevice for DummyDevice { + fn write(&mut self, _base: u64, _offset: u64, _data: &[u8]) -> Option> { + None + } + fn read(&mut self, _base: u64, _offset: u64, _data: &mut [u8]) {} } #[cfg(test)] @@ -115,18 +113,6 @@ impl BusDevice { _ => None, } } - pub fn boot_timer_ref(&self) -> Option<&BootTimer> { - match self { - Self::BootTimer(x) => Some(x), - _ => None, - } - } - pub fn mmio_transport_ref(&self) -> Option<&MmioTransport> { - match self { - Self::MmioTransport(x) => Some(x), - _ => None, - } - } pub fn serial_ref(&self) -> Option<&SerialDevice> { match self { Self::Serial(x) => Some(x), @@ -147,18 +133,6 @@ impl BusDevice { _ => None, } } - pub fn boot_timer_mut(&mut self) -> Option<&mut BootTimer> { - match self { - Self::BootTimer(x) => Some(x), - _ => None, - } - } - pub fn mmio_transport_mut(&mut self) -> Option<&mut MmioTransport> { - match self { - Self::MmioTransport(x) => Some(x), - _ => None, - } - } pub fn serial_mut(&mut self) -> Option<&mut SerialDevice> { match self { Self::Serial(x) => Some(x), @@ -171,11 +145,8 @@ impl BusDevice { Self::I8042Device(x) => x.bus_read(offset, data), #[cfg(target_arch = "aarch64")] Self::RTCDevice(x) => x.bus_read(offset, data), - 
Self::BootTimer(x) => x.bus_read(offset, data), - Self::MmioTransport(x) => x.bus_read(offset, data), Self::Serial(x) => x.bus_read(offset, data), #[cfg(test)] - Self::Dummy(x) => x.bus_read(offset, data), #[cfg(test)] Self::Constant(x) => x.bus_read(offset, data), } @@ -186,12 +157,8 @@ impl BusDevice { Self::I8042Device(x) => x.bus_write(offset, data), #[cfg(target_arch = "aarch64")] Self::RTCDevice(x) => x.bus_write(offset, data), - Self::BootTimer(x) => x.bus_write(offset, data), - Self::MmioTransport(x) => x.bus_write(offset, data), Self::Serial(x) => x.bus_write(offset, data), #[cfg(test)] - Self::Dummy(x) => x.bus_write(offset, data), - #[cfg(test)] Self::Constant(x) => x.bus_write(offset, data), } } @@ -314,7 +281,7 @@ mod tests { #[test] fn bus_insert() { let mut bus = Bus::new(); - let dummy = Arc::new(Mutex::new(BusDevice::Dummy(DummyDevice))); + let dummy = Arc::new(Mutex::new(BusDevice::Constant(ConstantDevice))); // Insert len should not be 0. bus.insert(dummy.clone(), 0x10, 0).unwrap_err(); bus.insert(dummy.clone(), 0x10, 0x10).unwrap(); @@ -338,23 +305,6 @@ mod tests { bus.insert(dummy, 0x0, 0x10).unwrap(); } - #[test] - fn bus_read_write() { - let mut bus = Bus::new(); - let dummy = Arc::new(Mutex::new(BusDevice::Dummy(DummyDevice))); - bus.insert(dummy, 0x10, 0x10).unwrap(); - assert!(bus.read(0x10, &mut [0, 0, 0, 0])); - assert!(bus.write(0x10, &[0, 0, 0, 0])); - assert!(bus.read(0x11, &mut [0, 0, 0, 0])); - assert!(bus.write(0x11, &[0, 0, 0, 0])); - assert!(bus.read(0x16, &mut [0, 0, 0, 0])); - assert!(bus.write(0x16, &[0, 0, 0, 0])); - assert!(!bus.read(0x20, &mut [0, 0, 0, 0])); - assert!(!bus.write(0x20, &[0, 0, 0, 0])); - assert!(!bus.read(0x06, &mut [0, 0, 0, 0])); - assert!(!bus.write(0x06, &[0, 0, 0, 0])); - } - #[test] fn bus_read_write_values() { let mut bus = Bus::new(); @@ -381,7 +331,7 @@ mod tests { let mut bus = Bus::new(); let mut data = [1, 2, 3, 4]; bus.insert( - Arc::new(Mutex::new(BusDevice::Dummy(DummyDevice))), + 
Arc::new(Mutex::new(BusDevice::Constant(ConstantDevice))), 0x10, 0x10, ) diff --git a/src/vmm/src/devices/legacy/rtc_pl031.rs b/src/vmm/src/devices/legacy/rtc_pl031.rs index 754899a23a4..b7ebc827e85 100644 --- a/src/vmm/src/devices/legacy/rtc_pl031.rs +++ b/src/vmm/src/devices/legacy/rtc_pl031.rs @@ -80,7 +80,7 @@ impl RTCDevice { pub fn bus_read(&mut self, offset: u64, data: &mut [u8]) { if let (Ok(offset), 4) = (u16::try_from(offset), data.len()) { // read() function from RTC implementation expects a slice of - // len 4, and we just validated that this is the data lengt + // len 4, and we just validated that this is the data length self.read(offset, data.try_into().unwrap()) } else { warn!( @@ -108,6 +108,23 @@ impl RTCDevice { } } +#[cfg(target_arch = "aarch64")] +impl vm_device::BusDevice for RTCDevice { + fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + self.bus_read(offset, data) + } + + fn write( + &mut self, + _base: u64, + offset: u64, + data: &[u8], + ) -> Option> { + self.bus_write(offset, data); + None + } +} + #[cfg(test)] mod tests { use vm_superio::Rtc; diff --git a/src/vmm/src/devices/legacy/serial.rs b/src/vmm/src/devices/legacy/serial.rs index c73534e76c4..b895635e56b 100644 --- a/src/vmm/src/devices/legacy/serial.rs +++ b/src/vmm/src/devices/legacy/serial.rs @@ -382,6 +382,25 @@ impl } } +#[cfg(target_arch = "aarch64")] +impl vm_device::BusDevice + for SerialWrapper +{ + fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + self.bus_read(offset, data) + } + + fn write( + &mut self, + _base: u64, + offset: u64, + data: &[u8], + ) -> Option> { + self.bus_write(offset, data); + None + } +} + #[cfg(test)] mod tests { #![allow(clippy::undocumented_unsafe_blocks)] diff --git a/src/vmm/src/devices/pseudo/boot_timer.rs b/src/vmm/src/devices/pseudo/boot_timer.rs index ba16e92355f..f0cf38977b5 100644 --- a/src/vmm/src/devices/pseudo/boot_timer.rs +++ b/src/vmm/src/devices/pseudo/boot_timer.rs @@ -1,6 +1,8 @@ // Copyright 2020 
Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 +use std::sync::{Arc, Barrier}; + use utils::time::TimestampUs; use crate::logger::info; @@ -8,16 +10,16 @@ use crate::logger::info; const MAGIC_VALUE_SIGNAL_GUEST_BOOT_COMPLETE: u8 = 123; /// Pseudo device to record the kernel boot time. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct BootTimer { start_ts: TimestampUs, } -impl BootTimer { - pub fn bus_write(&mut self, offset: u64, data: &[u8]) { +impl vm_device::BusDevice for BootTimer { + fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { // Only handle byte length instructions at a zero offset. if data.len() != 1 || offset != 0 { - return; + return None; } if data[0] == MAGIC_VALUE_SIGNAL_GUEST_BOOT_COMPLETE { @@ -33,8 +35,11 @@ impl BootTimer { boot_time_cpu_us / 1000 ); } + + None } - pub fn bus_read(&mut self, _offset: u64, _data: &[u8]) {} + + fn read(&mut self, _base: u64, _offset: u64, _data: &mut [u8]) {} } impl BootTimer { diff --git a/src/vmm/src/devices/virtio/transport/mmio.rs b/src/vmm/src/devices/virtio/transport/mmio.rs index 07cb03fbdbb..5ecc3fa8ffe 100644 --- a/src/vmm/src/devices/virtio/transport/mmio.rs +++ b/src/vmm/src/devices/virtio/transport/mmio.rs @@ -7,7 +7,7 @@ use std::fmt::Debug; use std::sync::atomic::{AtomicU32, Ordering}; -use std::sync::{Arc, Mutex, MutexGuard}; +use std::sync::{Arc, Barrier, Mutex, MutexGuard}; use vmm_sys_util::eventfd::EventFd; @@ -47,7 +47,7 @@ const MMIO_VERSION: u32 = 2; /// /// Typically one page (4096 bytes) of MMIO address space is sufficient to handle this transport /// and inner virtio device. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct MmioTransport { device: Arc>, // The register where feature bits are stored. 
@@ -232,8 +232,8 @@ impl MmioTransport { } } -impl MmioTransport { - pub fn bus_read(&mut self, offset: u64, data: &mut [u8]) { +impl vm_device::BusDevice for MmioTransport { + fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) { match offset { 0x00..=0xff if data.len() == 4 => { let v = match offset { @@ -287,12 +287,15 @@ impl MmioTransport { } 0x100..=0xfff => self.locked_device().read_config(offset - 0x100, data), _ => { - warn!("invalid virtio mmio read: {:#x}:{:#x}", offset, data.len()); + warn!( + "invalid virtio mmio read: {base:#x}:{offset:#x}:{:#x}", + data.len() + ); } }; } - pub fn bus_write(&mut self, offset: u64, data: &[u8]) { + fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option> { fn hi(v: &mut GuestAddress, x: u32) { *v = (*v & 0xffff_ffff) | (u64::from(x) << 32) } @@ -354,9 +357,13 @@ impl MmioTransport { } } _ => { - warn!("invalid virtio mmio write: {:#x}:{:#x}", offset, data.len()); + warn!( + "invalid virtio mmio write: {base:#x}:{offset:#x}:{:#x}", + data.len() + ); } } + None } } @@ -455,6 +462,7 @@ pub(crate) mod tests { use std::ops::Deref; + use vm_device::BusDevice; use vmm_sys_util::eventfd::EventFd; use super::*; @@ -567,7 +575,7 @@ pub(crate) mod tests { fn set_device_status(d: &mut MmioTransport, status: u32) { let mut buf = [0; 4]; write_le_u32(&mut buf[..], status); - d.bus_write(0x70, &buf[..]); + d.write(0x0, 0x70, &buf[..]); } #[test] @@ -615,7 +623,7 @@ pub(crate) mod tests { // The following read shouldn't be valid, because the length of the buf is not 4. buf.push(0); - d.bus_read(0, &mut buf[..]); + d.read(0x0, 0, &mut buf[..]); assert_eq!(buf[..4], buf_copy[..]); // the length is ok again @@ -623,74 +631,74 @@ pub(crate) mod tests { // Now we test that reading at various predefined offsets works as intended. 
- d.bus_read(0, &mut buf[..]); + d.read(0x0, 0, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), MMIO_MAGIC_VALUE); - d.bus_read(0x04, &mut buf[..]); + d.read(0x0, 0x04, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), MMIO_VERSION); - d.bus_read(0x08, &mut buf[..]); + d.read(0x0, 0x08, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), d.locked_device().device_type()); - d.bus_read(0x0c, &mut buf[..]); + d.read(0x0, 0x0c, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), VENDOR_ID); d.features_select = 0; - d.bus_read(0x10, &mut buf[..]); + d.read(0x0, 0x10, &mut buf[..]); assert_eq!( read_le_u32(&buf[..]), d.locked_device().avail_features_by_page(0) ); d.features_select = 1; - d.bus_read(0x10, &mut buf[..]); + d.read(0x0, 0x10, &mut buf[..]); assert_eq!( read_le_u32(&buf[..]), d.locked_device().avail_features_by_page(0) | 0x1 ); - d.bus_read(0x34, &mut buf[..]); + d.read(0x0, 0x34, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), 16); - d.bus_read(0x44, &mut buf[..]); + d.read(0x0, 0x44, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), u32::from(false)); d.interrupt.irq_status.store(111, Ordering::SeqCst); - d.bus_read(0x60, &mut buf[..]); + d.read(0x0, 0x60, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), 111); d.is_vhost_user = true; - d.interrupt.irq_status.store(0, Ordering::SeqCst); - d.bus_read(0x60, &mut buf[..]); + d.interrupt.status().store(0, Ordering::SeqCst); + d.read(0x0, 0x60, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), VIRTIO_MMIO_INT_VRING); d.is_vhost_user = true; d.interrupt .irq_status .store(VIRTIO_MMIO_INT_CONFIG, Ordering::SeqCst); - d.bus_read(0x60, &mut buf[..]); + d.read(0x0, 0x60, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), VIRTIO_MMIO_INT_CONFIG); - d.bus_read(0x70, &mut buf[..]); + d.read(0x0, 0x70, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), 0); d.config_generation = 5; - d.bus_read(0xfc, &mut buf[..]); + d.read(0x0, 0xfc, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), 5); // This read shouldn't do anything, as it's 
past the readable generic registers, and // before the device specific configuration space. Btw, reads from the device specific // conf space are going to be tested a bit later, alongside writes. buf = buf_copy.to_vec(); - d.bus_read(0xfd, &mut buf[..]); + d.read(0x0, 0xfd, &mut buf[..]); assert_eq!(buf[..], buf_copy[..]); // Read from an invalid address in generic register range. - d.bus_read(0xfb, &mut buf[..]); + d.read(0x0, 0xfb, &mut buf[..]); assert_eq!(buf[..], buf_copy[..]); // Read from an invalid length in generic register range. - d.bus_read(0xfc, &mut buf[..3]); + d.read(0x0, 0xfc, &mut buf[..3]); assert_eq!(buf[..], buf_copy[..]); } @@ -706,7 +714,7 @@ pub(crate) mod tests { // Nothing should happen, because the slice len > 4. d.features_select = 0; - d.bus_write(0x14, &buf[..]); + d.write(0x0, 0x14, &buf[..]); assert_eq!(d.features_select, 0); buf.pop(); @@ -718,7 +726,7 @@ pub(crate) mod tests { assert_eq!(d.locked_device().acked_features(), 0x0); d.acked_features_select = 0x0; write_le_u32(&mut buf[..], 1); - d.bus_write(0x20, &buf[..]); + d.write(0x0, 0x20, &buf[..]); assert_eq!(d.locked_device().acked_features(), 0x0); // Write to device specific configuration space should be ignored before setting @@ -727,8 +735,8 @@ pub(crate) mod tests { for i in (0..0xeff).rev() { let mut buf2 = vec![0; 0xeff]; - d.bus_write(0x100 + i as u64, &buf1[i..]); - d.bus_read(0x100, &mut buf2[..]); + d.write(0x0, 0x100 + i as u64, &buf1[i..]); + d.read(0x0, 0x100, &mut buf2[..]); for item in buf2.iter().take(0xeff) { assert_eq!(*item, 0); @@ -744,7 +752,7 @@ pub(crate) mod tests { // now writes should work d.features_select = 0; write_le_u32(&mut buf[..], 1); - d.bus_write(0x14, &buf[..]); + d.write(0x0, 0x14, &buf[..]); assert_eq!(d.features_select, 1); // Test acknowledging features on bus. @@ -753,12 +761,12 @@ pub(crate) mod tests { // Set the device available features in order to make acknowledging possible. 
dummy_dev.lock().unwrap().set_avail_features(0x124); - d.bus_write(0x20, &buf[..]); + d.write(0x0, 0x20, &buf[..]); assert_eq!(d.locked_device().acked_features(), 0x124); d.acked_features_select = 0; write_le_u32(&mut buf[..], 2); - d.bus_write(0x24, &buf[..]); + d.write(0x0, 0x24, &buf[..]); assert_eq!(d.acked_features_select, 2); set_device_status( &mut d, @@ -769,31 +777,31 @@ pub(crate) mod tests { assert_eq!(d.locked_device().acked_features(), 0x124); d.acked_features_select = 0x0; write_le_u32(&mut buf[..], 1); - d.bus_write(0x20, &buf[..]); + d.write(0x0, 0x20, &buf[..]); assert_eq!(d.locked_device().acked_features(), 0x124); // Setup queues d.queue_select = 0; write_le_u32(&mut buf[..], 3); - d.bus_write(0x30, &buf[..]); + d.write(0x0, 0x30, &buf[..]); assert_eq!(d.queue_select, 3); d.queue_select = 0; assert_eq!(d.locked_device().queues()[0].size, 0); write_le_u32(&mut buf[..], 16); - d.bus_write(0x38, &buf[..]); + d.write(0x0, 0x38, &buf[..]); assert_eq!(d.locked_device().queues()[0].size, 16); assert!(!d.locked_device().queues()[0].ready); write_le_u32(&mut buf[..], 1); - d.bus_write(0x44, &buf[..]); + d.write(0x0, 0x44, &buf[..]); assert!(d.locked_device().queues()[0].ready); assert_eq!(d.locked_device().queues()[0].desc_table_address.0, 0); write_le_u32(&mut buf[..], 123); - d.bus_write(0x80, &buf[..]); + d.write(0x0, 0x80, &buf[..]); assert_eq!(d.locked_device().queues()[0].desc_table_address.0, 123); - d.bus_write(0x84, &buf[..]); + d.write(0x0, 0x84, &buf[..]); assert_eq!( d.locked_device().queues()[0].desc_table_address.0, 123 + (123 << 32) @@ -801,9 +809,9 @@ pub(crate) mod tests { assert_eq!(d.locked_device().queues()[0].avail_ring_address.0, 0); write_le_u32(&mut buf[..], 124); - d.bus_write(0x90, &buf[..]); + d.write(0x0, 0x90, &buf[..]); assert_eq!(d.locked_device().queues()[0].avail_ring_address.0, 124); - d.bus_write(0x94, &buf[..]); + d.write(0x0, 0x94, &buf[..]); assert_eq!( d.locked_device().queues()[0].avail_ring_address.0, 124 + (124 << 
32) @@ -811,9 +819,9 @@ pub(crate) mod tests { assert_eq!(d.locked_device().queues()[0].used_ring_address.0, 0); write_le_u32(&mut buf[..], 125); - d.bus_write(0xa0, &buf[..]); + d.write(0x0, 0xa0, &buf[..]); assert_eq!(d.locked_device().queues()[0].used_ring_address.0, 125); - d.bus_write(0xa4, &buf[..]); + d.write(0x0, 0xa4, &buf[..]); assert_eq!( d.locked_device().queues()[0].used_ring_address.0, 125 + (125 << 32) @@ -829,17 +837,17 @@ pub(crate) mod tests { d.interrupt.irq_status.store(0b10_1010, Ordering::Relaxed); write_le_u32(&mut buf[..], 0b111); - d.bus_write(0x64, &buf[..]); + d.write(0x0, 0x64, &buf[..]); assert_eq!(d.interrupt.irq_status.load(Ordering::Relaxed), 0b10_1000); // Write to an invalid address in generic register range. write_le_u32(&mut buf[..], 0xf); d.config_generation = 0; - d.bus_write(0xfb, &buf[..]); + d.write(0x0, 0xfb, &buf[..]); assert_eq!(d.config_generation, 0); // Write to an invalid length in generic register range. - d.bus_write(0xfc, &buf[..2]); + d.write(0x0, 0xfc, &buf[..2]); assert_eq!(d.config_generation, 0); // Here we test writes/read into/from the device specific configuration space. @@ -847,8 +855,8 @@ pub(crate) mod tests { for i in (0..0xeff).rev() { let mut buf2 = vec![0; 0xeff]; - d.bus_write(0x100 + i as u64, &buf1[i..]); - d.bus_read(0x100, &mut buf2[..]); + d.write(0x0, 0x100 + i as u64, &buf1[i..]); + d.read(0x0, 0x100, &mut buf2[..]); for item in buf2.iter().take(i) { assert_eq!(*item, 0); @@ -903,17 +911,17 @@ pub(crate) mod tests { for q in 0..queue_len { d.queue_select = q.try_into().unwrap(); write_le_u32(&mut buf[..], 16); - d.bus_write(0x38, &buf[..]); + d.write(0x0, 0x38, &buf[..]); write_le_u32(&mut buf[..], 1); - d.bus_write(0x44, &buf[..]); + d.write(0x0, 0x44, &buf[..]); } assert!(!d.locked_device().is_activated()); // Device should be ready for activation now. // A couple of invalid writes; will trigger warnings; shouldn't activate the device. 
- d.bus_write(0xa8, &buf[..]); - d.bus_write(0x1000, &buf[..]); + d.write(0x0, 0xa8, &buf[..]); + d.write(0x0, 0x1000, &buf[..]); assert!(!d.locked_device().is_activated()); set_device_status( @@ -936,8 +944,8 @@ pub(crate) mod tests { // a warning path and have no effect on queue state. write_le_u32(&mut buf[..], 0); d.queue_select = 0; - d.bus_write(0x44, &buf[..]); - d.bus_read(0x44, &mut buf[..]); + d.write(0x0, 0x44, &buf[..]); + d.read(0x0, 0x44, &mut buf[..]); assert_eq!(read_le_u32(&buf[..]), 1); } @@ -963,9 +971,9 @@ pub(crate) mod tests { for q in 0..queue_len { d.queue_select = q.try_into().unwrap(); write_le_u32(&mut buf[..], 16); - d.bus_write(0x38, &buf[..]); + d.write(0x0, 0x38, &buf[..]); write_le_u32(&mut buf[..], 1); - d.bus_write(0x44, &buf[..]); + d.write(0x0, 0x44, &buf[..]); } assert!(!d.locked_device().is_activated()); @@ -1010,9 +1018,9 @@ pub(crate) mod tests { for q in 0..queues_count { d.queue_select = q.try_into().unwrap(); write_le_u32(&mut buf[..], 16); - d.bus_write(0x38, &buf[..]); + d.write(0x0, 0x38, &buf[..]); write_le_u32(&mut buf[..], 1); - d.bus_write(0x44, &buf[..]); + d.write(0x0, 0x44, &buf[..]); } assert!(!d.locked_device().is_activated()); @@ -1052,13 +1060,13 @@ pub(crate) mod tests { // Marking device as FAILED should not affect device_activated state write_le_u32(&mut buf[..], 0x8f); - d.bus_write(0x70, &buf[..]); + d.write(0x0, 0x70, &buf[..]); assert_eq!(d.device_status, 0x8f); assert!(d.locked_device().is_activated()); // Nothing happens when backend driver doesn't support reset write_le_u32(&mut buf[..], 0x0); - d.bus_write(0x70, &buf[..]); + d.write(0x0, 0x70, &buf[..]); assert_eq!(d.device_status, 0x8f); assert!(d.locked_device().is_activated()); } diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 29f3b0148ac..91be64e1d67 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -124,6 +124,7 @@ use std::time::Duration; use device_manager::acpi::ACPIDeviceManager; use 
device_manager::resources::ResourceAllocator; use devices::acpi::vmgenid::VmGenIdError; +use devices::virtio::device::VirtioDevice; use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEventSubscriber}; use seccomp::BpfProgram; use userfaultfd::Uffd; @@ -133,7 +134,6 @@ use vmm_sys_util::terminal::Terminal; use vstate::kvm::Kvm; use vstate::vcpu::{self, StartThreadedError, VcpuSendEventError}; -use crate::arch::DeviceType; use crate::cpu_config::templates::CpuConfiguration; #[cfg(target_arch = "x86_64")] use crate::device_manager::legacy::PortIODeviceManager; @@ -340,12 +340,16 @@ impl Vmm { } /// Gets the specified bus device. - pub fn get_bus_device( + pub fn get_virtio_device( &self, - device_type: DeviceType, + device_type: u32, device_id: &str, - ) -> Option<&Mutex> { - self.mmio_device_manager.get_device(device_type, device_id) + ) -> Option>> { + let device = self + .mmio_device_manager + .get_virtio_device(device_type, device_id)?; + + Some(device.inner.lock().expect("Poisoned lock").device().clone()) } /// Starts the microVM vcpus. @@ -452,20 +456,14 @@ impl Vmm { #[cfg(target_arch = "aarch64")] { - let serial_bus_device = self.get_bus_device(DeviceType::Serial, "Serial"); - if serial_bus_device.is_none() { - return Ok(()); - } - let mut serial_device_locked = - serial_bus_device.unwrap().lock().expect("Poisoned lock"); - let serial = serial_device_locked - .serial_mut() - .expect("Unexpected BusDeviceType"); + if let Some(device) = &self.mmio_device_manager.serial { + let mut device_locked = device.inner.lock().expect("Poisoned lock"); - serial - .serial - .write(IER_RDA_OFFSET, IER_RDA_BIT) - .map_err(|_| EmulateSerialInitError(std::io::Error::last_os_error()))?; + device_locked + .serial + .write(IER_RDA_OFFSET, IER_RDA_BIT) + .map_err(|_| EmulateSerialInitError(std::io::Error::last_os_error()))?; + } Ok(()) } @@ -646,15 +644,7 @@ impl Vmm { /// Returns a reference to the balloon device if present. 
pub fn balloon_config(&self) -> Result { - if let Some(busdev) = self.get_bus_device(DeviceType::Virtio(TYPE_BALLOON), BALLOON_DEV_ID) - { - let virtio_device = busdev - .lock() - .expect("Poisoned lock") - .mmio_transport_ref() - .expect("Unexpected device type") - .device(); - + if let Some(virtio_device) = self.get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) { let config = virtio_device .lock() .expect("Poisoned lock") @@ -671,15 +661,7 @@ impl Vmm { /// Returns the latest balloon statistics if they are enabled. pub fn latest_balloon_stats(&self) -> Result { - if let Some(busdev) = self.get_bus_device(DeviceType::Virtio(TYPE_BALLOON), BALLOON_DEV_ID) - { - let virtio_device = busdev - .lock() - .expect("Poisoned lock") - .mmio_transport_ref() - .expect("Unexpected device type") - .device(); - + if let Some(virtio_device) = self.get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) { let latest_stats = virtio_device .lock() .expect("Poisoned lock") @@ -704,16 +686,8 @@ impl Vmm { return Err(BalloonError::TooManyPagesRequested); } - if let Some(busdev) = self.get_bus_device(DeviceType::Virtio(TYPE_BALLOON), BALLOON_DEV_ID) - { + if let Some(virtio_device) = self.get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) { { - let virtio_device = busdev - .lock() - .expect("Poisoned lock") - .mmio_transport_ref() - .expect("Unexpected device type") - .device(); - virtio_device .lock() .expect("Poisoned lock") @@ -734,16 +708,8 @@ impl Vmm { &mut self, stats_polling_interval_s: u16, ) -> Result<(), BalloonError> { - if let Some(busdev) = self.get_bus_device(DeviceType::Virtio(TYPE_BALLOON), BALLOON_DEV_ID) - { + if let Some(virtio_device) = self.get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) { { - let virtio_device = busdev - .lock() - .expect("Poisoned lock") - .mmio_transport_ref() - .expect("Unexpected device type") - .device(); - virtio_device .lock() .expect("Poisoned lock") diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 4699b80b185..da5a603d820 100644 --- 
a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -69,7 +69,7 @@ impl From<&VmResources> for VmInfo { } } -/// Contains the necesary state for saving/restoring a microVM. +/// Contains the necessary state for saving/restoring a microVM. #[derive(Debug, Default, Serialize, Deserialize)] pub struct MicrovmState { /// Miscellaneous VM info. @@ -172,8 +172,9 @@ pub fn create_snapshot( // This should never fail as we only mark pages only if device has already been activated, // and the address validation was already performed on device activation. vmm.mmio_device_manager - .for_each_virtio_device(|_, _, _, dev| { - let mut d = dev.lock().unwrap(); + .for_each_virtio_device(|_, _, device| { + let mmio_dev_locked = device.inner.lock().expect("Poisoned lock"); + let d = mmio_dev_locked.locked_device(); if d.is_activated() { d.mark_queue_memory_dirty(vmm.vm.guest_memory()) } else { diff --git a/src/vmm/src/vstate/vcpu.rs b/src/vmm/src/vstate/vcpu.rs index fdd73b9c175..1c2991fd267 100644 --- a/src/vmm/src/vstate/vcpu.rs +++ b/src/vmm/src/vstate/vcpu.rs @@ -219,7 +219,7 @@ impl Vcpu { } /// Sets a MMIO bus for this vcpu. 
- pub fn set_mmio_bus(&mut self, mmio_bus: crate::devices::Bus) { + pub fn set_mmio_bus(&mut self, mmio_bus: Arc) { self.kvm_vcpu.peripherals.mmio_bus = Some(mmio_bus); } @@ -527,7 +527,9 @@ fn handle_kvm_exit( VcpuExit::MmioRead(addr, data) => { if let Some(mmio_bus) = &peripherals.mmio_bus { let _metric = METRICS.vcpu.exit_mmio_read_agg.record_latency_metrics(); - mmio_bus.read(addr, data); + if let Err(err) = mmio_bus.read(addr, data) { + warn!("Invalid MMIO read @ {addr:#x}:{:#x}: {err}", data.len()); + } METRICS.vcpu.exit_mmio_read.inc(); } Ok(VcpuEmulation::Handled) @@ -535,7 +537,9 @@ fn handle_kvm_exit( VcpuExit::MmioWrite(addr, data) => { if let Some(mmio_bus) = &peripherals.mmio_bus { let _metric = METRICS.vcpu.exit_mmio_write_agg.record_latency_metrics(); - mmio_bus.write(addr, data); + if let Err(err) = mmio_bus.write(addr, data) { + warn!("Invalid MMIO write @ {addr:#x}:{:#x}: {err}", data.len()); + } METRICS.vcpu.exit_mmio_write.inc(); } Ok(VcpuEmulation::Handled) @@ -771,7 +775,6 @@ pub(crate) mod tests { use super::*; use crate::RECV_TIMEOUT_SEC; use crate::arch::{BootProtocol, EntryPoint}; - use crate::devices::BusDevice; use crate::devices::bus::DummyDevice; use crate::seccomp::get_empty_filters; use crate::utils::mib_to_bytes; @@ -876,8 +879,8 @@ pub(crate) mod tests { ) ); - let mut bus = crate::devices::Bus::new(); - let dummy = Arc::new(Mutex::new(BusDevice::Dummy(DummyDevice))); + let bus = Arc::new(vm_device::Bus::new()); + let dummy = Arc::new(Mutex::new(DummyDevice)); bus.insert(dummy, 0x10, 0x10).unwrap(); vcpu.set_mmio_bus(bus); let addr = 0x10; @@ -1020,7 +1023,7 @@ pub(crate) mod tests { fn test_set_mmio_bus() { let (_, _, mut vcpu) = setup_vcpu(0x1000); assert!(vcpu.kvm_vcpu.peripherals.mmio_bus.is_none()); - vcpu.set_mmio_bus(crate::devices::Bus::new()); + vcpu.set_mmio_bus(Arc::new(vm_device::Bus::new())); assert!(vcpu.kvm_vcpu.peripherals.mmio_bus.is_some()); } From 065e368312e9c191ad08a5c7a2f0c67ffffdee70 Mon Sep 17 00:00:00 2001
From: Babis Chalios Date: Wed, 30 Apr 2025 17:25:59 +0200 Subject: [PATCH 09/56] refactor: simplify creating RTCDevice We were always constructing RTCDevice using a set of metrics that were defined in the RTC module itself. Don't leak the metrics to other modules. Instead, create a new() function that always constructs it the correct way. Signed-off-by: Babis Chalios --- src/vmm/src/builder.rs | 6 +----- src/vmm/src/device_manager/persist.rs | 4 +--- src/vmm/src/devices/legacy/rtc_pl031.rs | 15 ++++++++++++++- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index ac527b16747..f9ad80616fe 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -15,8 +15,6 @@ use userfaultfd::Uffd; use utils::time::TimestampUs; #[cfg(target_arch = "aarch64")] use vm_memory::GuestAddress; -#[cfg(target_arch = "aarch64")] -use vm_superio::Rtc; use crate::arch::{ConfigurationError, configure_system_for_boot, load_kernel}; #[cfg(target_arch = "aarch64")] @@ -628,9 +626,7 @@ fn attach_legacy_devices_aarch64( .map_err(VmmError::RegisterMMIODevice)?; } - let rtc = RTCDevice(Rtc::with_events( - &crate::devices::legacy::rtc_pl031::METRICS, - )); + let rtc = RTCDevice::new(); vmm.mmio_device_manager .register_mmio_rtc(&mut vmm.resource_allocator, rtc, None) .map_err(VmmError::RegisterMMIODevice) diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 9d06ce78350..a983140ed21 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -449,9 +449,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { )?; } if state.type_ == DeviceType::Rtc { - let rtc = RTCDevice(vm_superio::Rtc::with_events( - &crate::devices::legacy::rtc_pl031::METRICS, - )); + let rtc = RTCDevice::new(); constructor_args .resource_allocator .allocate_mmio_memory( diff --git a/src/vmm/src/devices/legacy/rtc_pl031.rs b/src/vmm/src/devices/legacy/rtc_pl031.rs index 
b7ebc827e85..b025c1d1512 100644 --- a/src/vmm/src/devices/legacy/rtc_pl031.rs +++ b/src/vmm/src/devices/legacy/rtc_pl031.rs @@ -4,6 +4,7 @@ use std::convert::TryInto; use serde::Serialize; +use vm_superio::Rtc; use vm_superio::rtc_pl031::RtcEvents; use crate::logger::{IncMetric, SharedIncMetric, warn}; @@ -59,7 +60,19 @@ pub static METRICS: RTCDeviceMetrics = RTCDeviceMetrics::new(); /// Wrapper over vm_superio's RTC implementation. #[derive(Debug)] -pub struct RTCDevice(pub vm_superio::Rtc<&'static RTCDeviceMetrics>); +pub struct RTCDevice(vm_superio::Rtc<&'static RTCDeviceMetrics>); + +impl Default for RTCDevice { + fn default() -> Self { + RTCDevice(Rtc::with_events(&METRICS)) + } +} + +impl RTCDevice { + pub fn new() -> RTCDevice { + Default::default() + } +} impl std::ops::Deref for RTCDevice { type Target = vm_superio::Rtc<&'static RTCDeviceMetrics>; From ee8c8043defbcb17fe6b3ea7a46900f28329a9c7 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Thu, 24 Apr 2025 17:14:25 +0200 Subject: [PATCH 10/56] refactor: use vm_device::Bus for IO bus Use the vm_device::Bus bus for PortIO devices on x86. PCIe devices will use this as well. 
Signed-off-by: Babis Chalios --- src/vmm/src/arch/x86_64/vcpu.rs | 12 +- src/vmm/src/builder.rs | 24 +- src/vmm/src/device_manager/legacy.rs | 51 ++-- src/vmm/src/devices/bus.rs | 354 --------------------------- src/vmm/src/devices/legacy/i8042.rs | 61 ++--- src/vmm/src/devices/legacy/serial.rs | 33 +-- src/vmm/src/devices/mod.rs | 2 - src/vmm/src/lib.rs | 5 +- src/vmm/src/vstate/vcpu.rs | 12 +- src/vmm/tests/devices.rs | 13 +- src/vmm/tests/integration_tests.rs | 2 +- 11 files changed, 112 insertions(+), 457 deletions(-) delete mode 100644 src/vmm/src/devices/bus.rs diff --git a/src/vmm/src/arch/x86_64/vcpu.rs b/src/vmm/src/arch/x86_64/vcpu.rs index 4eb609aadd6..eea1f24ae69 100644 --- a/src/vmm/src/arch/x86_64/vcpu.rs +++ b/src/vmm/src/arch/x86_64/vcpu.rs @@ -160,7 +160,7 @@ pub struct KvmVcpu { #[derive(Default, Debug)] pub struct Peripherals { /// Pio bus. - pub pio_bus: Option, + pub pio_bus: Option>, /// Mmio bus. pub mmio_bus: Option>, } @@ -267,7 +267,7 @@ impl KvmVcpu { } /// Sets a Port Mapped IO bus for this vcpu. 
- pub fn set_pio_bus(&mut self, pio_bus: crate::devices::Bus) { + pub fn set_pio_bus(&mut self, pio_bus: Arc) { self.peripherals.pio_bus = Some(pio_bus); } @@ -711,7 +711,9 @@ impl Peripherals { VcpuExit::IoIn(addr, data) => { if let Some(pio_bus) = &self.pio_bus { let _metric = METRICS.vcpu.exit_io_in_agg.record_latency_metrics(); - pio_bus.read(u64::from(addr), data); + if let Err(err) = pio_bus.read(u64::from(addr), data) { + warn!("vcpu: IO read @ {addr:#x}:{:#x} failed: {err}", data.len()); + } METRICS.vcpu.exit_io_in.inc(); } Ok(VcpuEmulation::Handled) @@ -719,7 +721,9 @@ impl Peripherals { VcpuExit::IoOut(addr, data) => { if let Some(pio_bus) = &self.pio_bus { let _metric = METRICS.vcpu.exit_io_out_agg.record_latency_metrics(); - pio_bus.write(u64::from(addr), data); + if let Err(err) = pio_bus.write(u64::from(addr), data) { + warn!("vcpu: IO write @ {addr:#x}:{:#x} failed: {err}", data.len()); + } METRICS.vcpu.exit_io_out.inc(); } Ok(VcpuEmulation::Handled) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index f9ad80616fe..5cba0ffbfe8 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -15,6 +15,8 @@ use userfaultfd::Uffd; use utils::time::TimestampUs; #[cfg(target_arch = "aarch64")] use vm_memory::GuestAddress; +#[cfg(target_arch = "x86_64")] +use vmm_sys_util::eventfd::EventFd; use crate::arch::{ConfigurationError, configure_system_for_boot, load_kernel}; #[cfg(target_arch = "aarch64")] @@ -30,8 +32,9 @@ use crate::device_manager::persist::{ ACPIDeviceManagerConstructorArgs, ACPIDeviceManagerRestoreError, MMIODevManagerConstructorArgs, }; use crate::device_manager::resources::ResourceAllocator; -use crate::devices::BusDevice; use crate::devices::acpi::vmgenid::{VmGenId, VmGenIdError}; +#[cfg(target_arch = "x86_64")] +use crate::devices::legacy::I8042Device; #[cfg(target_arch = "aarch64")] use crate::devices::legacy::RTCDevice; use crate::devices::legacy::SerialDevice; @@ -162,10 +165,14 @@ fn create_vmm_and_vcpus( // x86_64 
uses the i8042 reset event as the Vmm exit event. let reset_evt = vcpus_exit_evt.try_clone().map_err(VmmError::EventFd)?; + let i8042 = Arc::new(Mutex::new(I8042Device::new( + reset_evt, + EventFd::new(libc::EFD_NONBLOCK).map_err(VmmError::EventFd)?, + ))); // create pio dev manager with legacy devices let mut pio_dev_mgr = - PortIODeviceManager::new(serial_device, reset_evt).map_err(VmmError::LegacyIOBus)?; + PortIODeviceManager::new(serial_device, i8042).map_err(VmmError::LegacyIOBus)?; pio_dev_mgr .register_devices(vm.fd()) .map_err(VmmError::LegacyIOBus)?; @@ -549,11 +556,11 @@ pub fn build_microvm_from_snapshot( /// Sets up the serial device. pub fn setup_serial_device( event_manager: &mut EventManager, -) -> Result>, VmmError> { - let serial = Arc::new(Mutex::new(BusDevice::Serial( +) -> Result>, VmmError> { + let serial = Arc::new(Mutex::new( SerialDevice::new(Some(std::io::stdin()), SerialOut::Stdout(std::io::stdout())) .map_err(VmmError::EventFd)?, - ))); + )); event_manager.add_subscriber(serial.clone()); Ok(serial) } @@ -879,10 +886,13 @@ pub(crate) mod tests { let acpi_device_manager = ACPIDeviceManager::new(); #[cfg(target_arch = "x86_64")] let pio_device_manager = PortIODeviceManager::new( - Arc::new(Mutex::new(BusDevice::Serial( + Arc::new(Mutex::new( SerialDevice::new(None, SerialOut::Sink(std::io::sink())).unwrap(), + )), + Arc::new(Mutex::new(I8042Device::new( + EventFd::new(libc::EFD_NONBLOCK).unwrap(), + EventFd::new(libc::EFD_NONBLOCK).unwrap(), ))), - EventFd::new(libc::EFD_NONBLOCK).unwrap(), ) .unwrap(); diff --git a/src/vmm/src/device_manager/legacy.rs b/src/vmm/src/device_manager/legacy.rs index 20b008769a5..0af1ae3348a 100644 --- a/src/vmm/src/device_manager/legacy.rs +++ b/src/vmm/src/device_manager/legacy.rs @@ -16,15 +16,14 @@ use libc::EFD_NONBLOCK; use vm_superio::Serial; use vmm_sys_util::eventfd::EventFd; -use crate::devices::bus::BusDevice; use crate::devices::legacy::serial::SerialOut; -use 
crate::devices::legacy::{EventFdTrigger, SerialDevice, SerialEventsWrapper}; +use crate::devices::legacy::{EventFdTrigger, I8042Device, SerialDevice, SerialEventsWrapper}; /// Errors corresponding to the `PortIODeviceManager`. #[derive(Debug, derive_more::From, thiserror::Error, displaydoc::Display)] pub enum LegacyDeviceError { /// Failed to add legacy device to Bus: {0} - BusError(crate::devices::BusError), + BusError(vm_device::BusError), /// Failed to create EventFd: {0} EventFd(std::io::Error), } @@ -34,11 +33,11 @@ pub enum LegacyDeviceError { /// The `LegacyDeviceManger` should be initialized only by using the constructor. #[derive(Debug)] pub struct PortIODeviceManager { - pub io_bus: crate::devices::Bus, + pub io_bus: Arc, // BusDevice::Serial - pub stdio_serial: Arc>, + pub stdio_serial: Arc>, // BusDevice::I8042Device - pub i8042: Arc>, + pub i8042: Arc>, // Communication event on ports 1 & 3. pub com_evt_1_3: EventFdTrigger, @@ -73,29 +72,26 @@ impl PortIODeviceManager { /// Create a new DeviceManager handling legacy devices (uart, i8042). 
pub fn new( - serial: Arc>, - i8042_reset_evfd: EventFd, + stdio_serial: Arc>, + i8042: Arc>, ) -> Result { - debug_assert!(matches!(*serial.lock().unwrap(), BusDevice::Serial(_))); - let io_bus = crate::devices::Bus::new(); - let com_evt_1_3 = serial + let io_bus = Arc::new(vm_device::Bus::new()); + let com_evt_1_3 = stdio_serial .lock() .expect("Poisoned lock") - .serial_mut() - .unwrap() .serial .interrupt_evt() .try_clone()?; let com_evt_2_4 = EventFdTrigger::new(EventFd::new(EFD_NONBLOCK)?); - let kbd_evt = EventFd::new(libc::EFD_NONBLOCK)?; - - let i8042 = Arc::new(Mutex::new(BusDevice::I8042Device( - crate::devices::legacy::I8042Device::new(i8042_reset_evfd, kbd_evt.try_clone()?), - ))); + let kbd_evt = i8042 + .lock() + .expect("Poisoned lock") + .kbd_interrupt_evt + .try_clone()?; Ok(PortIODeviceManager { io_bus, - stdio_serial: serial, + stdio_serial, i8042, com_evt_1_3, com_evt_2_4, @@ -105,7 +101,7 @@ impl PortIODeviceManager { /// Register supported legacy devices. pub fn register_devices(&mut self, vm_fd: &VmFd) -> Result<(), LegacyDeviceError> { - let serial_2_4 = Arc::new(Mutex::new(BusDevice::Serial(SerialDevice { + let serial_2_4 = Arc::new(Mutex::new(SerialDevice { serial: Serial::with_events( self.com_evt_2_4.try_clone()?.try_clone()?, SerialEventsWrapper { @@ -114,8 +110,8 @@ impl PortIODeviceManager { SerialOut::Sink(std::io::sink()), ), input: None, - }))); - let serial_1_3 = Arc::new(Mutex::new(BusDevice::Serial(SerialDevice { + })); + let serial_1_3 = Arc::new(Mutex::new(SerialDevice { serial: Serial::with_events( self.com_evt_1_3.try_clone()?.try_clone()?, SerialEventsWrapper { @@ -124,7 +120,7 @@ impl PortIODeviceManager { SerialOut::Sink(std::io::sink()), ), input: None, - }))); + })); self.io_bus.insert( self.stdio_serial.clone(), Self::SERIAL_PORT_ADDRESSES[0], @@ -251,7 +247,7 @@ mod tests { let (_, vm) = setup_vm_with_memory(0x1000); vm.setup_irqchip().unwrap(); let mut ldm = PortIODeviceManager::new( - 
Arc::new(Mutex::new(BusDevice::Serial(SerialDevice { + Arc::new(Mutex::new(SerialDevice { serial: Serial::with_events( EventFdTrigger::new(EventFd::new(EFD_NONBLOCK).unwrap()), SerialEventsWrapper { @@ -260,8 +256,11 @@ mod tests { SerialOut::Sink(std::io::sink()), ), input: None, - }))), - EventFd::new(libc::EFD_NONBLOCK).unwrap(), + })), + Arc::new(Mutex::new(I8042Device::new( + EventFd::new(libc::EFD_NONBLOCK).unwrap(), + EventFd::new(libc::EFD_NONBLOCK).unwrap(), + ))), ) .unwrap(); ldm.register_devices(vm.fd()).unwrap(); diff --git a/src/vmm/src/devices/bus.rs b/src/vmm/src/devices/bus.rs deleted file mode 100644 index 6f7e1531bf3..00000000000 --- a/src/vmm/src/devices/bus.rs +++ /dev/null @@ -1,354 +0,0 @@ -// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 -// -// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the THIRD-PARTY file. - -//! Handles routing to devices in an address space. - -use std::cmp::{Ord, Ordering, PartialEq, PartialOrd}; -use std::collections::btree_map::BTreeMap; -#[cfg(test)] -use std::sync::Barrier; -use std::sync::{Arc, Mutex}; - -/// Errors triggered during bus operations. -#[derive(Debug, thiserror::Error, displaydoc::Display)] -pub enum BusError { - /// New device overlaps with an old device. - Overlap, -} - -#[derive(Debug, Copy, Clone)] -struct BusRange(u64, u64); - -impl Eq for BusRange {} - -impl PartialEq for BusRange { - fn eq(&self, other: &BusRange) -> bool { - self.0 == other.0 - } -} - -impl Ord for BusRange { - fn cmp(&self, other: &BusRange) -> Ordering { - self.0.cmp(&other.0) - } -} - -impl PartialOrd for BusRange { - fn partial_cmp(&self, other: &BusRange) -> Option { - Some(self.cmp(other)) - } -} - -/// A device container for routing reads and writes over some address space. 
-/// -/// This doesn't have any restrictions on what kind of device or address space this applies to. The -/// only restriction is that no two devices can overlap in this address space. -#[derive(Debug, Clone, Default)] -pub struct Bus { - devices: BTreeMap>>, -} - -use event_manager::{EventOps, Events, MutEventSubscriber}; - -#[cfg(target_arch = "aarch64")] -use super::legacy::RTCDevice; -use super::legacy::{I8042Device, SerialDevice}; - -#[derive(Debug)] -pub enum BusDevice { - I8042Device(I8042Device), - #[cfg(target_arch = "aarch64")] - RTCDevice(RTCDevice), - Serial(SerialDevice), - #[cfg(test)] - Constant(ConstantDevice), -} - -#[cfg(test)] -#[derive(Debug)] -pub struct DummyDevice; - -#[cfg(test)] -impl vm_device::BusDevice for DummyDevice { - fn write(&mut self, _base: u64, _offset: u64, _data: &[u8]) -> Option> { - None - } - fn read(&mut self, _base: u64, _offset: u64, _data: &mut [u8]) {} -} - -#[cfg(test)] -#[derive(Debug)] -pub struct ConstantDevice; - -#[cfg(test)] -impl ConstantDevice { - pub fn bus_read(&mut self, offset: u64, data: &mut [u8]) { - for (i, v) in data.iter_mut().enumerate() { - *v = ((offset + i as u64) & 0xff) as u8; - } - } - - fn bus_write(&mut self, offset: u64, data: &[u8]) { - for (i, v) in data.iter().enumerate() { - assert_eq!(*v, ((offset + i as u64) & 0xff) as u8) - } - } -} - -impl BusDevice { - pub fn i8042_device_ref(&self) -> Option<&I8042Device> { - match self { - Self::I8042Device(x) => Some(x), - _ => None, - } - } - #[cfg(target_arch = "aarch64")] - pub fn rtc_device_ref(&self) -> Option<&RTCDevice> { - match self { - Self::RTCDevice(x) => Some(x), - _ => None, - } - } - pub fn serial_ref(&self) -> Option<&SerialDevice> { - match self { - Self::Serial(x) => Some(x), - _ => None, - } - } - - pub fn i8042_device_mut(&mut self) -> Option<&mut I8042Device> { - match self { - Self::I8042Device(x) => Some(x), - _ => None, - } - } - #[cfg(target_arch = "aarch64")] - pub fn rtc_device_mut(&mut self) -> Option<&mut RTCDevice> 
{ - match self { - Self::RTCDevice(x) => Some(x), - _ => None, - } - } - pub fn serial_mut(&mut self) -> Option<&mut SerialDevice> { - match self { - Self::Serial(x) => Some(x), - _ => None, - } - } - - pub fn read(&mut self, offset: u64, data: &mut [u8]) { - match self { - Self::I8042Device(x) => x.bus_read(offset, data), - #[cfg(target_arch = "aarch64")] - Self::RTCDevice(x) => x.bus_read(offset, data), - Self::Serial(x) => x.bus_read(offset, data), - #[cfg(test)] - #[cfg(test)] - Self::Constant(x) => x.bus_read(offset, data), - } - } - - pub fn write(&mut self, offset: u64, data: &[u8]) { - match self { - Self::I8042Device(x) => x.bus_write(offset, data), - #[cfg(target_arch = "aarch64")] - Self::RTCDevice(x) => x.bus_write(offset, data), - Self::Serial(x) => x.bus_write(offset, data), - #[cfg(test)] - Self::Constant(x) => x.bus_write(offset, data), - } - } -} - -impl MutEventSubscriber for BusDevice { - fn process(&mut self, event: Events, ops: &mut EventOps) { - match self { - Self::Serial(serial) => serial.process(event, ops), - _ => panic!(), - } - } - fn init(&mut self, ops: &mut EventOps) { - match self { - Self::Serial(serial) => serial.init(ops), - _ => panic!(), - } - } -} - -impl Bus { - /// Constructs an a bus with an empty address space. - pub fn new() -> Bus { - Bus { - devices: BTreeMap::new(), - } - } - - fn first_before(&self, addr: u64) -> Option<(BusRange, &Mutex)> { - // for when we switch to rustc 1.17: self.devices.range(..addr).iter().rev().next() - for (range, dev) in self.devices.iter().rev() { - if range.0 <= addr { - return Some((*range, dev)); - } - } - None - } - - /// Returns the device found at some address. - pub fn get_device(&self, addr: u64) -> Option<(u64, &Mutex)> { - if let Some((BusRange(start, len), dev)) = self.first_before(addr) { - let offset = addr - start; - if offset < len { - return Some((offset, dev)); - } - } - None - } - - /// Puts the given device at the given address space. 
- pub fn insert( - &mut self, - device: Arc>, - base: u64, - len: u64, - ) -> Result<(), BusError> { - if len == 0 { - return Err(BusError::Overlap); - } - - // Reject all cases where the new device's base is within an old device's range. - if self.get_device(base).is_some() { - return Err(BusError::Overlap); - } - - // The above check will miss an overlap in which the new device's base address is before the - // range of another device. To catch that case, we search for a device with a range before - // the new device's range's end. If there is no existing device in that range that starts - // after the new device, then there will be no overlap. - if let Some((BusRange(start, _), _)) = self.first_before(base + len - 1) { - // Such a device only conflicts with the new device if it also starts after the new - // device because of our initial `get_device` check above. - if start >= base { - return Err(BusError::Overlap); - } - } - - if self.devices.insert(BusRange(base, len), device).is_some() { - return Err(BusError::Overlap); - } - - Ok(()) - } - - /// Reads data from the device that owns the range containing `addr` and puts it into `data`. - /// - /// Returns true on success, otherwise `data` is untouched. - pub fn read(&self, addr: u64, data: &mut [u8]) -> bool { - if let Some((offset, dev)) = self.get_device(addr) { - // OK to unwrap as lock() failing is a serious error condition and should panic. - dev.lock() - .expect("Failed to acquire device lock") - .read(offset, data); - true - } else { - false - } - } - - /// Writes `data` to the device that owns the range containing `addr`. - /// - /// Returns true on success, otherwise `data` is untouched. - pub fn write(&self, addr: u64, data: &[u8]) -> bool { - if let Some((offset, dev)) = self.get_device(addr) { - // OK to unwrap as lock() failing is a serious error condition and should panic. 
- dev.lock() - .expect("Failed to acquire device lock") - .write(offset, data); - true - } else { - false - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn bus_insert() { - let mut bus = Bus::new(); - let dummy = Arc::new(Mutex::new(BusDevice::Constant(ConstantDevice))); - // Insert len should not be 0. - bus.insert(dummy.clone(), 0x10, 0).unwrap_err(); - bus.insert(dummy.clone(), 0x10, 0x10).unwrap(); - - let result = bus.insert(dummy.clone(), 0x0f, 0x10); - // This overlaps the address space of the existing bus device at 0x10. - assert!(matches!(result, Err(BusError::Overlap)), "{:?}", result); - - // This overlaps the address space of the existing bus device at 0x10. - bus.insert(dummy.clone(), 0x10, 0x10).unwrap_err(); - // This overlaps the address space of the existing bus device at 0x10. - bus.insert(dummy.clone(), 0x10, 0x15).unwrap_err(); - // This overlaps the address space of the existing bus device at 0x10. - bus.insert(dummy.clone(), 0x12, 0x15).unwrap_err(); - // This overlaps the address space of the existing bus device at 0x10. - bus.insert(dummy.clone(), 0x12, 0x01).unwrap_err(); - // This overlaps the address space of the existing bus device at 0x10. 
- bus.insert(dummy.clone(), 0x0, 0x20).unwrap_err(); - bus.insert(dummy.clone(), 0x20, 0x05).unwrap(); - bus.insert(dummy.clone(), 0x25, 0x05).unwrap(); - bus.insert(dummy, 0x0, 0x10).unwrap(); - } - - #[test] - fn bus_read_write_values() { - let mut bus = Bus::new(); - let dummy = Arc::new(Mutex::new(BusDevice::Constant(ConstantDevice))); - bus.insert(dummy, 0x10, 0x10).unwrap(); - - let mut values = [0, 1, 2, 3]; - assert!(bus.read(0x10, &mut values)); - assert_eq!(values, [0, 1, 2, 3]); - assert!(bus.write(0x10, &values)); - assert!(bus.read(0x15, &mut values)); - assert_eq!(values, [5, 6, 7, 8]); - assert!(bus.write(0x15, &values)); - } - - #[test] - fn busrange_cmp_and_clone() { - assert_eq!(BusRange(0x10, 2), BusRange(0x10, 3)); - assert_eq!(BusRange(0x10, 2), BusRange(0x10, 2)); - - assert!(BusRange(0x10, 2) < BusRange(0x12, 1)); - assert!(BusRange(0x10, 2) < BusRange(0x12, 3)); - - let mut bus = Bus::new(); - let mut data = [1, 2, 3, 4]; - bus.insert( - Arc::new(Mutex::new(BusDevice::Constant(ConstantDevice))), - 0x10, - 0x10, - ) - .unwrap(); - assert!(bus.write(0x10, &data)); - let bus_clone = bus.clone(); - assert!(bus.read(0x10, &mut data)); - assert_eq!(data, [1, 2, 3, 4]); - assert!(bus_clone.read(0x10, &mut data)); - assert_eq!(data, [1, 2, 3, 4]); - } - - #[test] - fn test_display_error() { - assert_eq!( - format!("{}", BusError::Overlap), - "New device overlaps with an old device." - ); - } -} diff --git a/src/vmm/src/devices/legacy/i8042.rs b/src/vmm/src/devices/legacy/i8042.rs index bcf7bdd8c90..1bc830bd13b 100644 --- a/src/vmm/src/devices/legacy/i8042.rs +++ b/src/vmm/src/devices/legacy/i8042.rs @@ -7,6 +7,7 @@ use std::io; use std::num::Wrapping; +use std::sync::{Arc, Barrier}; use log::warn; use serde::Serialize; @@ -96,7 +97,7 @@ pub struct I8042Device { reset_evt: EventFd, /// Keyboard interrupt event (IRQ 1). - kbd_interrupt_evt: EventFd, + pub kbd_interrupt_evt: EventFd, /// The i8042 status register. 
status: u8, @@ -209,8 +210,8 @@ impl I8042Device { } } -impl I8042Device { - pub fn bus_read(&mut self, offset: u64, data: &mut [u8]) { +impl vm_device::BusDevice for I8042Device { + fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { // All our ports are byte-wide. We don't know how to handle any wider data. if data.len() != 1 { METRICS.missed_read_count.inc(); @@ -245,11 +246,11 @@ impl I8042Device { } } - pub fn bus_write(&mut self, offset: u64, data: &[u8]) { + fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { // All our ports are byte-wide. We don't know how to handle any wider data. if data.len() != 1 { METRICS.missed_write_count.inc(); - return; + return None; } let mut write_ok = true; @@ -335,11 +336,15 @@ impl I8042Device { } else { METRICS.missed_write_count.inc(); } + + None } } #[cfg(test)] mod tests { + use vm_device::BusDevice; + use super::*; impl PartialEq for I8042Error { @@ -358,9 +363,9 @@ mod tests { // Check if reading in a 2-length array doesn't have side effects. let mut data = [1, 2]; - i8042.bus_read(0, &mut data); + i8042.read(0x0, 0, &mut data); assert_eq!(data, [1, 2]); - i8042.bus_read(1, &mut data); + i8042.read(0x0, 1, &mut data); assert_eq!(data, [1, 2]); // Check if reset works. @@ -368,23 +373,23 @@ mod tests { // counter doesn't change (for 0 it blocks). reset_evt.write(1).unwrap(); let mut data = [CMD_RESET_CPU]; - i8042.bus_write(OFS_STATUS, &data); + i8042.write(0x0, OFS_STATUS, &data); assert_eq!(reset_evt.read().unwrap(), 2); // Check if reading with offset 1 doesn't have side effects. - i8042.bus_read(1, &mut data); + i8042.read(0x0, 1, &mut data); assert_eq!(data[0], CMD_RESET_CPU); // Check invalid `write`s. let before = METRICS.missed_write_count.count(); // offset != 0. 
- i8042.bus_write(1, &data); + i8042.write(0x0, 1, &data); // data != CMD_RESET_CPU data[0] = CMD_RESET_CPU + 1; - i8042.bus_write(1, &data); + i8042.write(0x0, 1, &data); // data.len() != 1 let data = [CMD_RESET_CPU; 2]; - i8042.bus_write(1, &data); + i8042.write(0x0, 1, &data); assert_eq!(METRICS.missed_write_count.count(), before + 3); } @@ -398,33 +403,33 @@ mod tests { // Test reading/writing the control register. data[0] = CMD_WRITE_CTR; - i8042.bus_write(OFS_STATUS, &data); + i8042.write(0x0, OFS_STATUS, &data); assert_ne!(i8042.status & SB_I8042_CMD_DATA, 0); data[0] = 0x52; - i8042.bus_write(OFS_DATA, &data); + i8042.write(0x0, OFS_DATA, &data); data[0] = CMD_READ_CTR; - i8042.bus_write(OFS_STATUS, &data); + i8042.write(0x0, OFS_STATUS, &data); assert_ne!(i8042.status & SB_OUT_DATA_AVAIL, 0); - i8042.bus_read(OFS_DATA, &mut data); + i8042.read(0x0, OFS_DATA, &mut data); assert_eq!(data[0], 0x52); // Test reading/writing the output port. data[0] = CMD_WRITE_OUTP; - i8042.bus_write(OFS_STATUS, &data); + i8042.write(0x0, OFS_STATUS, &data); assert_ne!(i8042.status & SB_I8042_CMD_DATA, 0); data[0] = 0x52; - i8042.bus_write(OFS_DATA, &data); + i8042.write(0x0, OFS_DATA, &data); data[0] = CMD_READ_OUTP; - i8042.bus_write(OFS_STATUS, &data); + i8042.write(0x0, OFS_STATUS, &data); assert_ne!(i8042.status & SB_OUT_DATA_AVAIL, 0); - i8042.bus_read(OFS_DATA, &mut data); + i8042.read(0x0, OFS_DATA, &mut data); assert_eq!(data[0], 0x52); // Test kbd commands. data[0] = 0x52; - i8042.bus_write(OFS_DATA, &data); + i8042.write(0x0, OFS_DATA, &data); assert_ne!(i8042.status & SB_OUT_DATA_AVAIL, 0); - i8042.bus_read(OFS_DATA, &mut data); + i8042.read(0x0, OFS_DATA, &mut data); assert_eq!(data[0], 0xFA); } @@ -470,13 +475,13 @@ mod tests { assert!(i8042.kbd_interrupt_evt.read().unwrap() > 1); // The "data available" flag should be on. 
- i8042.bus_read(OFS_STATUS, &mut data); + i8042.read(0x0, OFS_STATUS, &mut data); let mut key_byte: u8; if key & 0xFF00 != 0 { // For extended keys, we should be able to read the MSB first. key_byte = ((key & 0xFF00) >> 8) as u8; - i8042.bus_read(OFS_DATA, &mut data); + i8042.read(0x0, OFS_DATA, &mut data); assert_eq!(data[0], key_byte); // And then do the same for the LSB. @@ -485,10 +490,10 @@ mod tests { i8042.trigger_kbd_interrupt().unwrap(); assert!(i8042.kbd_interrupt_evt.read().unwrap() > 1); // The "data available" flag should be on. - i8042.bus_read(OFS_STATUS, &mut data); + i8042.read(0x0, OFS_STATUS, &mut data); } key_byte = (key & 0xFF) as u8; - i8042.bus_read(OFS_DATA, &mut data); + i8042.read(0x0, OFS_DATA, &mut data); assert_eq!(data[0], key_byte); } @@ -530,9 +535,9 @@ mod tests { // Test kbd interrupt disable. let mut data = [1]; data[0] = CMD_WRITE_CTR; - i8042.bus_write(OFS_STATUS, &data); + i8042.write(0x0, OFS_STATUS, &data); data[0] = i8042.control & !CB_KBD_INT; - i8042.bus_write(OFS_DATA, &data); + i8042.write(0x0, OFS_DATA, &data); i8042.trigger_key(KEY_CTRL).unwrap(); assert_eq!( i8042.trigger_kbd_interrupt().unwrap_err(), diff --git a/src/vmm/src/devices/legacy/serial.rs b/src/vmm/src/devices/legacy/serial.rs index b895635e56b..afc47189c1e 100644 --- a/src/vmm/src/devices/legacy/serial.rs +++ b/src/vmm/src/devices/legacy/serial.rs @@ -9,6 +9,7 @@ use std::fmt::Debug; use std::io::{self, Read, Stdin, Write}; use std::os::unix::io::{AsRawFd, RawFd}; +use std::sync::{Arc, Barrier}; use event_manager::{EventOps, Events, MutEventSubscriber}; use libc::EFD_NONBLOCK; @@ -358,10 +359,11 @@ fn is_fifo(fd: RawFd) -> bool { (stat.st_mode & libc::S_IFIFO) != 0 } -impl - SerialWrapper +impl vm_device::BusDevice for SerialWrapper +where + I: Read + AsRawFd + Send, { - pub fn bus_read(&mut self, offset: u64, data: &mut [u8]) { + fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { if let (Ok(offset), 1) = (u8::try_from(offset), data.len()) { 
data[0] = self.serial.read(offset); } else { @@ -369,7 +371,7 @@ impl } } - pub fn bus_write(&mut self, offset: u64, data: &[u8]) { + fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { if let (Ok(offset), 1) = (u8::try_from(offset), data.len()) { if let Err(err) = self.serial.write(offset, data[0]) { // Counter incremented for any handle_write() error. @@ -379,24 +381,6 @@ impl } else { METRICS.missed_write_count.inc(); } - } -} - -#[cfg(target_arch = "aarch64")] -impl vm_device::BusDevice - for SerialWrapper -{ - fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { - self.bus_read(offset, data) - } - - fn write( - &mut self, - _base: u64, - offset: u64, - data: &[u8], - ) -> Option> { - self.bus_write(offset, data); None } } @@ -405,6 +389,7 @@ impl vm_device::BusDevice mod tests { #![allow(clippy::undocumented_unsafe_blocks)] + use vm_device::BusDevice; use vmm_sys_util::eventfd::EventFd; use super::*; @@ -430,13 +415,13 @@ mod tests { let invalid_reads_before = metrics.missed_read_count.count(); let mut v = [0x00; 2]; - serial.bus_read(0u64, &mut v); + serial.read(0x0, 0u64, &mut v); let invalid_reads_after = metrics.missed_read_count.count(); assert_eq!(invalid_reads_before + 1, invalid_reads_after); let mut v = [0x00; 1]; - serial.bus_read(0u64, &mut v); + serial.read(0x0, 0u64, &mut v); assert_eq!(v[0], b'a'); let invalid_reads_after_2 = metrics.missed_read_count.count(); diff --git a/src/vmm/src/devices/mod.rs b/src/vmm/src/devices/mod.rs index 495e1507edd..dd58acc9337 100644 --- a/src/vmm/src/devices/mod.rs +++ b/src/vmm/src/devices/mod.rs @@ -10,12 +10,10 @@ use std::io; pub mod acpi; -pub mod bus; pub mod legacy; pub mod pseudo; pub mod virtio; -pub use bus::{Bus, BusDevice, BusError}; use log::error; use crate::devices::virtio::net::metrics::NetDeviceMetrics; diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 91be64e1d67..10411c24487 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -469,12 +469,11 @@ impl 
Vmm { #[cfg(target_arch = "x86_64")] { - let mut guard = self + let mut serial = self .pio_device_manager .stdio_serial .lock() .expect("Poisoned lock"); - let serial = guard.serial_mut().unwrap(); serial .serial @@ -491,8 +490,6 @@ impl Vmm { .i8042 .lock() .expect("i8042 lock was poisoned") - .i8042_device_mut() - .unwrap() .trigger_ctrl_alt_del() .map_err(VmmError::I8042Error) } diff --git a/src/vmm/src/vstate/vcpu.rs b/src/vmm/src/vstate/vcpu.rs index 1c2991fd267..8a1728dffca 100644 --- a/src/vmm/src/vstate/vcpu.rs +++ b/src/vmm/src/vstate/vcpu.rs @@ -770,12 +770,12 @@ pub(crate) mod tests { use std::sync::{Arc, Barrier, Mutex}; use linux_loader::loader::KernelLoader; + use vm_device::BusDevice; use vmm_sys_util::errno; use super::*; use crate::RECV_TIMEOUT_SEC; use crate::arch::{BootProtocol, EntryPoint}; - use crate::devices::bus::DummyDevice; use crate::seccomp::get_empty_filters; use crate::utils::mib_to_bytes; use crate::utils::signal::validate_signal_num; @@ -785,6 +785,16 @@ pub(crate) mod tests { use crate::vstate::vm::Vm; use crate::vstate::vm::tests::setup_vm_with_memory; + struct DummyDevice; + + impl BusDevice for DummyDevice { + fn read(&mut self, _base: u64, _offset: u64, _data: &mut [u8]) {} + + fn write(&mut self, _base: u64, _offset: u64, _data: &[u8]) -> Option> { + None + } + } + #[test] fn test_handle_kvm_exit() { let (_, _, mut vcpu) = setup_vcpu(0x1000); diff --git a/src/vmm/tests/devices.rs b/src/vmm/tests/devices.rs index 1850bf540b0..e86f4765af4 100644 --- a/src/vmm/tests/devices.rs +++ b/src/vmm/tests/devices.rs @@ -8,6 +8,7 @@ use std::sync::{Arc, Mutex}; use event_manager::{EventManager, SubscriberOps}; use libc::EFD_NONBLOCK; +use vm_device::BusDevice; use vm_superio::Serial; use vmm::devices::legacy::serial::SerialOut; use vmm::devices::legacy::{EventFdTrigger, SerialEventsWrapper, SerialWrapper}; @@ -91,7 +92,7 @@ fn test_issue_serial_hangup_anon_pipe_while_registered_stdin() { serial .lock() .unwrap() - .bus_read(data_bus_offset, 
&mut data[i..=i]); + .read(0x0, data_bus_offset, &mut data[i..=i]); } assert!(data[..31] == dummy_data[..31]); @@ -138,7 +139,7 @@ fn test_issue_serial_hangup_anon_pipe_while_registered_stdin() { serial .lock() .unwrap() - .bus_read(data_bus_offset, &mut data[i..=i]); + .read(0x0, data_bus_offset, &mut data[i..=i]); } // Process the kick stdin event generated by the reading of the 64th byte of the serial FIFO. @@ -152,7 +153,7 @@ fn test_issue_serial_hangup_anon_pipe_while_registered_stdin() { serial .lock() .unwrap() - .bus_read(data_bus_offset, &mut data[i..=i]); + .read(0x0, data_bus_offset, &mut data[i..=i]); } // We try to read again, but we detect that stdin received previously EOF. @@ -239,7 +240,7 @@ fn test_issue_serial_hangup_anon_pipe_while_unregistered_stdin() { serial .lock() .unwrap() - .bus_read(data_bus_offset, &mut data[i..=i]); + .read(0x0, data_bus_offset, &mut data[i..=i]); } assert!(data[..31] == dummy_data[..31]); @@ -289,7 +290,7 @@ fn test_issue_serial_hangup_anon_pipe_while_unregistered_stdin() { serial .lock() .unwrap() - .bus_read(data_bus_offset, &mut data[i..=i]); + .read(0x0, data_bus_offset, &mut data[i..=i]); } // Process the kick stdin event generated by the reading of the 64th byte of the serial FIFO. @@ -305,7 +306,7 @@ fn test_issue_serial_hangup_anon_pipe_while_unregistered_stdin() { serial .lock() .unwrap() - .bus_read(data_bus_offset, &mut data[i..=i]); + .read(0x0, data_bus_offset, &mut data[i..=i]); } // We try to read again, but we detect that stdin received previously EOF. diff --git a/src/vmm/tests/integration_tests.rs b/src/vmm/tests/integration_tests.rs index 55fb07c1aae..e513da695c0 100644 --- a/src/vmm/tests/integration_tests.rs +++ b/src/vmm/tests/integration_tests.rs @@ -227,7 +227,7 @@ fn verify_create_snapshot(is_diff: bool) -> (TempFile, TempFile) { // Check that we can deserialize the microVM state from `snapshot_file`. 
let snapshot_path = snapshot_file.as_path().to_path_buf(); let snapshot_file_metadata = std::fs::metadata(snapshot_path).unwrap(); - let snapshot_len = snapshot_file_metadata.len() as usize; + let snapshot_len = snapshot_file_metadata.len().try_into().unwrap(); let (restored_microvm_state, _) = Snapshot::load::<_, MicrovmState>(&mut snapshot_file.as_file(), snapshot_len).unwrap(); From 19cf2d446536de2747fd4d40ac28286f3e7db265 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 25 Apr 2025 16:52:48 +0200 Subject: [PATCH 11/56] refactor: add top-level device manager The PCIe spec mandates that software can access the configuration space of PCIe devices via both MMIO and Port IO accesses. As a result, PCIe devices will need to register on both buses (on x86). Change the organization of devices so that the MMIO and PIO device managers do not own the buses. Instead, introduce a DeviceManager object which holds the buses and the resource allocator, and also includes all types of device managers (at the moment MMIO, PIO and ACPI). 
Signed-off-by: Babis Chalios --- src/vmm/src/acpi/mod.rs | 109 +++-- src/vmm/src/arch/aarch64/fdt.rs | 107 ++--- src/vmm/src/arch/aarch64/mod.rs | 5 +- src/vmm/src/arch/x86_64/mod.rs | 10 +- src/vmm/src/builder.rs | 355 ++++---------- src/vmm/src/device_manager/legacy.rs | 22 +- src/vmm/src/device_manager/mmio.rs | 140 +++--- src/vmm/src/device_manager/mod.rs | 451 ++++++++++++++++++ src/vmm/src/device_manager/persist.rs | 54 ++- src/vmm/src/lib.rs | 101 +--- src/vmm/src/persist.rs | 29 +- src/vmm/tests/integration_tests.rs | 26 +- .../integration_tests/functional/test_api.py | 4 +- 13 files changed, 857 insertions(+), 556 deletions(-) diff --git a/src/vmm/src/acpi/mod.rs b/src/vmm/src/acpi/mod.rs index 0b5c5edcbde..542e53409b7 100644 --- a/src/vmm/src/acpi/mod.rs +++ b/src/vmm/src/acpi/mod.rs @@ -10,8 +10,7 @@ use crate::Vcpu; use crate::acpi::x86_64::{ apic_addr, rsdp_addr, setup_arch_dsdt, setup_arch_fadt, setup_interrupt_controllers, }; -use crate::device_manager::acpi::ACPIDeviceManager; -use crate::device_manager::mmio::MMIODeviceManager; +use crate::device_manager::DeviceManager; use crate::device_manager::resources::ResourceAllocator; use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; @@ -45,7 +44,6 @@ pub enum AcpiError { /// allocator for allocating space for the tables struct AcpiTableWriter<'a> { mem: &'a GuestMemoryMmap, - resource_allocator: &'a mut ResourceAllocator, } impl AcpiTableWriter<'_> { @@ -53,11 +51,15 @@ impl AcpiTableWriter<'_> { /// /// This will allocate enough space inside guest memory and write the table in the allocated /// buffer. It returns the address in which it wrote the table. 
- fn write_acpi_table(&mut self, table: &mut S) -> Result + fn write_acpi_table( + &mut self, + resource_allocator: &mut ResourceAllocator, + table: &mut S, + ) -> Result where S: Sdt, { - let addr = self.resource_allocator.allocate_system_memory( + let addr = resource_allocator.allocate_system_memory( table.len().try_into().unwrap(), 1, AllocPolicy::FirstMatch, @@ -77,30 +79,32 @@ impl AcpiTableWriter<'_> { } /// Build the DSDT table for the guest - fn build_dsdt( - &mut self, - mmio_device_manager: &MMIODeviceManager, - acpi_device_manager: &ACPIDeviceManager, - ) -> Result { + fn build_dsdt(&mut self, device_manager: &mut DeviceManager) -> Result { let mut dsdt_data = Vec::new(); // Virtio-devices DSDT data - dsdt_data.extend_from_slice(&mmio_device_manager.dsdt_data); + dsdt_data.extend_from_slice(&device_manager.mmio_devices.dsdt_data); // Add GED and VMGenID AML data. - acpi_device_manager.append_aml_bytes(&mut dsdt_data)?; + device_manager + .acpi_devices + .append_aml_bytes(&mut dsdt_data)?; // Architecture specific DSDT data setup_arch_dsdt(&mut dsdt_data)?; let mut dsdt = Dsdt::new(OEM_ID, *b"FCVMDSDT", OEM_REVISION, dsdt_data); - self.write_acpi_table(&mut dsdt) + self.write_acpi_table(&mut device_manager.resource_allocator, &mut dsdt) } /// Build the FADT table for the guest /// /// This includes a pointer with the location of the DSDT in guest memory - fn build_fadt(&mut self, dsdt_addr: u64) -> Result { + fn build_fadt( + &mut self, + resource_allocator: &mut ResourceAllocator, + dsdt_addr: u64, + ) -> Result { let mut fadt = Fadt::new(OEM_ID, *b"FCVMFADT", OEM_REVISION); fadt.set_hypervisor_vendor_id(HYPERVISOR_VENDOR_ID); fadt.set_x_dsdt(dsdt_addr); @@ -108,13 +112,17 @@ impl AcpiTableWriter<'_> { (1 << FADT_F_HW_REDUCED_ACPI) | (1 << FADT_F_PWR_BUTTON) | (1 << FADT_F_SLP_BUTTON), ); setup_arch_fadt(&mut fadt); - self.write_acpi_table(&mut fadt) + self.write_acpi_table(resource_allocator, &mut fadt) } /// Build the MADT table for the guest /// /// 
This includes information about the interrupt controllers supported in the platform - fn build_madt(&mut self, nr_vcpus: u8) -> Result { + fn build_madt( + &mut self, + resource_allocator: &mut ResourceAllocator, + nr_vcpus: u8, + ) -> Result { let mut madt = Madt::new( OEM_ID, *b"FCVMMADT", @@ -122,20 +130,25 @@ impl AcpiTableWriter<'_> { apic_addr(), setup_interrupt_controllers(nr_vcpus), ); - self.write_acpi_table(&mut madt) + self.write_acpi_table(resource_allocator, &mut madt) } /// Build the XSDT table for the guest /// /// Currently, we pass to the guest just FADT and MADT tables. - fn build_xsdt(&mut self, fadt_addr: u64, madt_addr: u64) -> Result { + fn build_xsdt( + &mut self, + resource_allocator: &mut ResourceAllocator, + fadt_addr: u64, + madt_addr: u64, + ) -> Result { let mut xsdt = Xsdt::new( OEM_ID, *b"FCMVXSDT", OEM_REVISION, vec![fadt_addr, madt_addr], ); - self.write_acpi_table(&mut xsdt) + self.write_acpi_table(resource_allocator, &mut xsdt) } /// Build the RSDP pointer for the guest. @@ -163,20 +176,19 @@ impl AcpiTableWriter<'_> { /// such as interrupt controllers, vCPUs and VirtIO devices. 
pub(crate) fn create_acpi_tables( mem: &GuestMemoryMmap, - resource_allocator: &mut ResourceAllocator, - mmio_device_manager: &MMIODeviceManager, - acpi_device_manager: &ACPIDeviceManager, + device_manager: &mut DeviceManager, vcpus: &[Vcpu], ) -> Result<(), AcpiError> { - let mut writer = AcpiTableWriter { - mem, - resource_allocator, - }; - - let dsdt_addr = writer.build_dsdt(mmio_device_manager, acpi_device_manager)?; - let fadt_addr = writer.build_fadt(dsdt_addr)?; - let madt_addr = writer.build_madt(vcpus.len().try_into().unwrap())?; - let xsdt_addr = writer.build_xsdt(fadt_addr, madt_addr)?; + let mut writer = AcpiTableWriter { mem }; + + let dsdt_addr = writer.build_dsdt(device_manager)?; + let fadt_addr = writer.build_fadt(&mut device_manager.resource_allocator, dsdt_addr)?; + let madt_addr = writer.build_madt( + &mut device_manager.resource_allocator, + vcpus.len().try_into().unwrap(), + )?; + let xsdt_addr = + writer.build_xsdt(&mut device_manager.resource_allocator, fadt_addr, madt_addr)?; writer.build_rsdp(xsdt_addr) } @@ -218,17 +230,20 @@ mod tests { let mut vmm = default_vmm(); let mut writer = AcpiTableWriter { mem: vmm.vm.guest_memory(), - resource_allocator: &mut vmm.resource_allocator, }; // This should succeed let mut sdt = MockSdt(vec![0; 4096]); - let addr = writer.write_acpi_table(&mut sdt).unwrap(); + let addr = writer + .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .unwrap(); assert_eq!(addr, SYSTEM_MEM_START); // Let's try to write two 4K pages plus one byte let mut sdt = MockSdt(vec![0; usize::try_from(SYSTEM_MEM_SIZE + 1).unwrap()]); - let err = writer.write_acpi_table(&mut sdt).unwrap_err(); + let err = writer + .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .unwrap_err(); assert!( matches!( err, @@ -241,19 +256,29 @@ mod tests { // We are allocating memory for tables with alignment of 1 byte. All of these should // succeed. 
let mut sdt = MockSdt(vec![0; 5]); - let addr = writer.write_acpi_table(&mut sdt).unwrap(); + let addr = writer + .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4096); let mut sdt = MockSdt(vec![0; 2]); - let addr = writer.write_acpi_table(&mut sdt).unwrap(); + let addr = writer + .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4101); let mut sdt = MockSdt(vec![0; 4]); - let addr = writer.write_acpi_table(&mut sdt).unwrap(); + let addr = writer + .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4103); let mut sdt = MockSdt(vec![0; 8]); - let addr = writer.write_acpi_table(&mut sdt).unwrap(); + let addr = writer + .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4107); let mut sdt = MockSdt(vec![0; 16]); - let addr = writer.write_acpi_table(&mut sdt).unwrap(); + let addr = writer + .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4115); } @@ -268,11 +293,13 @@ mod tests { let (_, vm) = setup_vm_with_memory(u64_to_usize(SYSTEM_MEM_START + SYSTEM_MEM_SIZE - 4096)); let mut writer = AcpiTableWriter { mem: vm.guest_memory(), - resource_allocator: &mut ResourceAllocator::new().unwrap(), }; + let mut resource_allocator = ResourceAllocator::new().unwrap(); let mut sdt = MockSdt(vec![0; usize::try_from(SYSTEM_MEM_SIZE).unwrap()]); - let err = writer.write_acpi_table(&mut sdt).unwrap_err(); + let err = writer + .write_acpi_table(&mut resource_allocator, &mut sdt) + .unwrap_err(); assert!( matches!( err, diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index 54c8a30225c..be53ef6993d 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -13,6 +13,7 @@ use 
vm_memory::GuestMemoryError; use super::cache_info::{CacheEntry, read_cache_config}; use super::gic::GICDevice; +use crate::device_manager::DeviceManager; use crate::device_manager::mmio::MMIODeviceInfo; use crate::devices::acpi::vmgenid::{VMGENID_MEM_SIZE, VmGenId}; use crate::initrd::InitrdConfig; @@ -59,11 +60,8 @@ pub fn create_fdt( guest_mem: &GuestMemoryMmap, vcpu_mpidr: Vec, cmdline: CString, - virtio_devices: Vec<&MMIODeviceInfo>, - rtc: Option<&MMIODeviceInfo>, - serial: Option<&MMIODeviceInfo>, + device_manager: &DeviceManager, gic_device: &GICDevice, - vmgenid: &Option, initrd: &Option, ) -> Result, FdtError> { // Allocate stuff necessary for storing the blob. @@ -90,8 +88,8 @@ pub fn create_fdt( create_timer_node(&mut fdt_writer)?; create_clock_node(&mut fdt_writer)?; create_psci_node(&mut fdt_writer)?; - create_devices_node(&mut fdt_writer, virtio_devices, rtc, serial)?; - create_vmgenid_node(&mut fdt_writer, vmgenid)?; + create_devices_node(&mut fdt_writer, device_manager)?; + create_vmgenid_node(&mut fdt_writer, &device_manager.acpi_devices.vmgenid)?; // End Header node. 
fdt_writer.end_node(root)?; @@ -412,21 +410,21 @@ fn create_rtc_node(fdt: &mut FdtWriter, dev_info: &MMIODeviceInfo) -> Result<(), fn create_devices_node( fdt: &mut FdtWriter, - mut virtio_devices: Vec<&MMIODeviceInfo>, - rtc: Option<&MMIODeviceInfo>, - serial: Option<&MMIODeviceInfo>, + device_manager: &DeviceManager, ) -> Result<(), FdtError> { - if let Some(device_info) = rtc { - create_rtc_node(fdt, device_info)?; + if let Some(rtc_info) = device_manager.mmio_devices.rtc_device_info() { + create_rtc_node(fdt, rtc_info)?; } - if let Some(device_info) = serial { - create_serial_node(fdt, device_info)?; + if let Some(serial_info) = device_manager.mmio_devices.serial_device_info() { + create_serial_node(fdt, serial_info)?; } + let mut virtio_mmio = device_manager.mmio_devices.virtio_device_info(); + // Sort out virtio devices by address from low to high and insert them into fdt table. - virtio_devices.sort_by_key(|a| a.addr); - for ordered_device_info in virtio_devices.drain(..) { + virtio_mmio.sort_by_key(|a| a.addr); + for ordered_device_info in virtio_mmio.drain(..) { create_virtio_node(fdt, ordered_device_info)?; } @@ -436,19 +434,20 @@ fn create_devices_node( #[cfg(test)] mod tests { use std::ffi::CString; - use std::num::NonZeroU32; + use std::sync::{Arc, Mutex}; use kvm_ioctls::Kvm; + use linux_loader::cmdline as kernel_cmdline; use super::*; + use crate::EventManager; use crate::arch::aarch64::gic::create_gic; use crate::arch::aarch64::layout; - use crate::device_manager::resources::ResourceAllocator; + use crate::device_manager::mmio::tests::DummyDevice; + use crate::device_manager::tests::default_device_manager; use crate::test_utils::arch_mem; use crate::vstate::memory::GuestAddress; - const LEN: u64 = 4096; - // The `load` function from the `device_tree` will mistakenly check the actual size // of the buffer with the allocated size. This works around that. 
fn set_size(buf: &mut [u8], pos: usize, val: u32) { @@ -461,36 +460,37 @@ mod tests { #[test] fn test_create_fdt_with_devices() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); - - let serial = MMIODeviceInfo { - addr: 0x00, - irq: NonZeroU32::new(1), - len: LEN, - }; - let virtio_device = MMIODeviceInfo { - addr: LEN, - irq: NonZeroU32::new(2), - len: LEN, - }; - let rtc = MMIODeviceInfo { - addr: 2 * LEN, - irq: NonZeroU32::new(3), - len: LEN, - }; - + let mut event_manager = EventManager::new().unwrap(); + let mut device_manager = default_device_manager(); let kvm = Kvm::new().unwrap(); let vm = kvm.create_vm().unwrap(); let gic = create_gic(&vm, 1, None).unwrap(); + let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); + cmdline.insert("console", "/dev/tty0").unwrap(); + + device_manager + .attach_legacy_devices_aarch64(&vm, &mut event_manager, &mut cmdline) + .unwrap(); + let dummy = Arc::new(Mutex::new(DummyDevice::new())); + device_manager + .mmio_devices + .register_virtio_test_device( + &vm, + mem.clone(), + &mut device_manager.resource_allocator, + dummy, + &mut cmdline, + "dummy", + ) + .unwrap(); + create_fdt( &mem, vec![0], - CString::new("console=tty0").unwrap(), - vec![&virtio_device], - Some(&rtc), - Some(&serial), + cmdline.as_cstring().unwrap(), + &device_manager, &gic, &None, - &None, ) .unwrap(); } @@ -498,20 +498,21 @@ mod tests { #[test] fn test_create_fdt_with_vmgenid() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); - let mut resource_allocator = ResourceAllocator::new().unwrap(); - let vmgenid = VmGenId::new(&mem, &mut resource_allocator).unwrap(); + let mut device_manager = default_device_manager(); let kvm = Kvm::new().unwrap(); let vm = kvm.create_vm().unwrap(); let gic = create_gic(&vm, 1, None).unwrap(); + let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); + cmdline.insert("console", "/dev/tty0").unwrap(); + + device_manager.attach_vmgenid_device(&mem, &vm).unwrap(); + create_fdt( &mem, vec![0], 
CString::new("console=tty0").unwrap(), - Vec::new(), - None, - None, + &device_manager, &gic, - &Some(vmgenid), &None, ) .unwrap(); @@ -520,6 +521,7 @@ mod tests { #[test] fn test_create_fdt() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); + let device_manager = default_device_manager(); let kvm = Kvm::new().unwrap(); let vm = kvm.create_vm().unwrap(); let gic = create_gic(&vm, 1, None).unwrap(); @@ -534,12 +536,9 @@ mod tests { &mem, vec![0], CString::new("console=tty0").unwrap(), - Vec::new(), - None, - None, + &device_manager, &gic, &None, - &None, ) .unwrap(); @@ -579,6 +578,7 @@ mod tests { #[test] fn test_create_fdt_with_initrd() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); + let device_manager = default_device_manager(); let kvm = Kvm::new().unwrap(); let vm = kvm.create_vm().unwrap(); let gic = create_gic(&vm, 1, None).unwrap(); @@ -598,11 +598,8 @@ mod tests { &mem, vec![0], CString::new("console=tty0").unwrap(), - vec![], - None, - None, + &device_manager, &gic, - &None, &Some(initrd), ) .unwrap(); diff --git a/src/vmm/src/arch/aarch64/mod.rs b/src/vmm/src/arch/aarch64/mod.rs index f945601c940..6d1d0e26359 100644 --- a/src/vmm/src/arch/aarch64/mod.rs +++ b/src/vmm/src/arch/aarch64/mod.rs @@ -134,11 +134,8 @@ pub fn configure_system_for_boot( vmm.vm.guest_memory(), vcpu_mpidr, cmdline, - vmm.mmio_device_manager.virtio_device_info(), - vmm.mmio_device_manager.rtc_device_info(), - vmm.mmio_device_manager.serial_device_info(), + &vmm.device_manager, vmm.vm.get_irqchip(), - &vmm.acpi_device_manager.vmgenid, initrd, )?; diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index ca350cbf9af..c54ec46c987 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -205,7 +205,7 @@ pub fn configure_system_for_boot( // Note that this puts the mptable at the last 1k of Linux's 640k base RAM mptable::setup_mptable( vmm.vm.guest_memory(), - &mut vmm.resource_allocator, + &mut 
vmm.device_manager.resource_allocator, vcpu_config.vcpu_count, ) .map_err(ConfigurationError::MpTableSetup)?; @@ -226,13 +226,7 @@ pub fn configure_system_for_boot( // Create ACPI tables and write them in guest memory // For the time being we only support ACPI in x86_64 - create_acpi_tables( - vmm.vm.guest_memory(), - &mut vmm.resource_allocator, - &vmm.mmio_device_manager, - &vmm.acpi_device_manager, - vcpus, - )?; + create_acpi_tables(vmm.vm.guest_memory(), &mut vmm.device_manager, vcpus)?; Ok(()) } diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 5cba0ffbfe8..48590201f2d 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -9,14 +9,12 @@ use std::io; use std::sync::mpsc; use std::sync::{Arc, Mutex}; -use event_manager::{MutEventSubscriber, SubscriberOps}; +use event_manager::SubscriberOps; use linux_loader::cmdline::Cmdline as LoaderKernelCmdline; use userfaultfd::Uffd; use utils::time::TimestampUs; #[cfg(target_arch = "aarch64")] use vm_memory::GuestAddress; -#[cfg(target_arch = "x86_64")] -use vmm_sys_util::eventfd::EventFd; use crate::arch::{ConfigurationError, configure_system_for_boot, load_kernel}; #[cfg(target_arch = "aarch64")] @@ -24,36 +22,24 @@ use crate::construct_kvm_mpidrs; use crate::cpu_config::templates::{ GetCpuTemplate, GetCpuTemplateError, GuestConfigError, KvmCapability, }; -use crate::device_manager::acpi::ACPIDeviceManager; -#[cfg(target_arch = "x86_64")] -use crate::device_manager::legacy::PortIODeviceManager; -use crate::device_manager::mmio::{MMIODeviceManager, MmioError}; -use crate::device_manager::persist::{ - ACPIDeviceManagerConstructorArgs, ACPIDeviceManagerRestoreError, MMIODevManagerConstructorArgs, -}; -use crate::device_manager::resources::ResourceAllocator; -use crate::devices::acpi::vmgenid::{VmGenId, VmGenIdError}; -#[cfg(target_arch = "x86_64")] -use crate::devices::legacy::I8042Device; #[cfg(target_arch = "aarch64")] -use crate::devices::legacy::RTCDevice; -use 
crate::devices::legacy::SerialDevice; -use crate::devices::legacy::serial::SerialOut; +use crate::device_manager::AttachLegacyMmioDeviceError; +use crate::device_manager::{ + AttachMmioDeviceError, AttachVmgenidError, DeviceManager, DevicePersistError, DeviceRestoreArgs, +}; +use crate::devices::acpi::vmgenid::VmGenIdError; use crate::devices::virtio::balloon::Balloon; use crate::devices::virtio::block::device::Block; -use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::net::Net; use crate::devices::virtio::rng::Entropy; -use crate::devices::virtio::transport::mmio::{IrqTrigger, MmioTransport}; use crate::devices::virtio::vsock::{Vsock, VsockUnixBackend}; #[cfg(feature = "gdb")] use crate::gdb; use crate::initrd::{InitrdConfig, InitrdError}; -use crate::logger::{debug, error}; +use crate::logger::debug; use crate::persist::{MicrovmState, MicrovmStateError}; use crate::resources::VmResources; use crate::seccomp::BpfThreadMap; -use crate::snapshot::Persist; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::MachineConfigError; use crate::vstate::kvm::Kvm; @@ -68,7 +54,10 @@ pub enum StartMicrovmError { /// Unable to attach block device to Vmm: {0} AttachBlockDevice(io::Error), /// Unable to attach the VMGenID device: {0} - AttachVmgenidDevice(kvm_ioctls::Error), + AttachVmgenidDevice(#[from] AttachVmgenidError), + #[cfg(target_arch = "aarch64")] + /// Unable to attach legacy MMIO devices: {0} + AttachLegacyDevices(#[from] AttachLegacyMmioDeviceError), /// System configuration error: {0} ConfigureSystem(#[from] ConfigurationError), /// Failed to create guest config: {0} @@ -108,7 +97,7 @@ pub enum StartMicrovmError { /// Cannot open the block device backing file: {0} OpenBlockDevice(io::Error), /// Cannot initialize a MMIO Device or add a device to the MMIO Bus or cmdline: {0} - RegisterMmioDevice(#[from] device_manager::mmio::MmioError), + RegisterMmioDevice(#[from] 
device_manager::AttachMmioDeviceError), /// Cannot restore microvm state: {0} RestoreMicrovmState(MicrovmStateError), /// Cannot set vm resources: {0} @@ -145,39 +134,9 @@ fn create_vmm_and_vcpus( // Build custom CPU config if a custom template is provided. let mut vm = Vm::new(&kvm)?; - let resource_allocator = ResourceAllocator::new()?; - - // Instantiate the MMIO device manager. - let mmio_device_manager = MMIODeviceManager::new(); - - // Instantiate ACPI device manager. - let acpi_device_manager = ACPIDeviceManager::new(); - let (vcpus, vcpus_exit_evt) = vm.create_vcpus(vcpu_count)?; - #[cfg(target_arch = "x86_64")] - let pio_device_manager = { - // Make stdout non blocking. - set_stdout_nonblocking(); - - // Serial device setup. - let serial_device = setup_serial_device(event_manager)?; - - // x86_64 uses the i8042 reset event as the Vmm exit event. - let reset_evt = vcpus_exit_evt.try_clone().map_err(VmmError::EventFd)?; - let i8042 = Arc::new(Mutex::new(I8042Device::new( - reset_evt, - EventFd::new(libc::EFD_NONBLOCK).map_err(VmmError::EventFd)?, - ))); - - // create pio dev manager with legacy devices - let mut pio_dev_mgr = - PortIODeviceManager::new(serial_device, i8042).map_err(VmmError::LegacyIOBus)?; - pio_dev_mgr - .register_devices(vm.fd()) - .map_err(VmmError::LegacyIOBus)?; - pio_dev_mgr - }; + let device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, vm.fd())?; let vmm = Vmm { events_observer: Some(std::io::stdin()), @@ -188,11 +147,7 @@ fn create_vmm_and_vcpus( uffd: None, vcpus_handles: Vec::new(), vcpus_exit_evt, - resource_allocator, - mmio_device_manager, - #[cfg(target_arch = "x86_64")] - pio_device_manager, - acpi_device_manager, + device_manager, }; Ok((vmm, vcpus)) @@ -263,7 +218,7 @@ pub fn build_microvm_for_boot( // to maintain the same MMIO address referenced in the documentation // and tests. 
if vm_resources.boot_timer { - attach_boot_timer_device(&mut vmm, request_ts)?; + vmm.device_manager.attach_boot_timer_device(request_ts)?; } if let Some(balloon) = vm_resources.balloon.get() { @@ -292,9 +247,14 @@ pub fn build_microvm_for_boot( } #[cfg(target_arch = "aarch64")] - attach_legacy_devices_aarch64(event_manager, &mut vmm, &mut boot_cmdline)?; + vmm.device_manager.attach_legacy_devices_aarch64( + vmm.vm.fd(), + event_manager, + &mut boot_cmdline, + )?; - attach_vmgenid_device(&mut vmm)?; + vmm.device_manager + .attach_vmgenid_device(vmm.vm.guest_memory(), vmm.vm.fd())?; #[cfg(target_arch = "aarch64")] if vcpus[0].kvm_vcpu.supports_pvtime() { @@ -413,10 +373,8 @@ pub enum BuildMicrovmFromSnapshotError { MissingVmmSeccompFilters, /// Failed to apply VMM secccomp filter: {0} SeccompFiltersInternal(#[from] crate::seccomp::InstallationError), - /// Failed to restore ACPI device manager: {0} - ACPIDeviManager(#[from] ACPIDeviceManagerRestoreError), - /// VMGenID update failed: {0} - VMGenIDUpdate(std::io::Error), + /// Failed to restore devices: {0} + RestoreDevices(#[from] DevicePersistError), } /// Builds and starts a microVM based on the provided MicrovmState. @@ -496,38 +454,17 @@ pub fn build_microvm_from_snapshot( vm_resources.boot_source.config = microvm_state.vm_info.boot_source; // Restore devices states. 
- let mmio_ctor_args = MMIODevManagerConstructorArgs { + let device_ctor_args = DeviceRestoreArgs { mem: vmm.vm.guest_memory(), vm: vmm.vm.fd(), event_manager, - resource_allocator: &mut vmm.resource_allocator, vm_resources, instance_id: &instance_info.id, restored_from_file: vmm.uffd.is_none(), }; - vmm.mmio_device_manager = - MMIODeviceManager::restore(mmio_ctor_args, µvm_state.device_states) - .map_err(MicrovmStateError::RestoreDevices)?; - vmm.emulate_serial_init()?; - - { - let acpi_ctor_args = ACPIDeviceManagerConstructorArgs { - mem: vmm.vm.guest_memory(), - resource_allocator: &mut vmm.resource_allocator, - vm: vmm.vm.fd(), - }; - - vmm.acpi_device_manager = - ACPIDeviceManager::restore(acpi_ctor_args, µvm_state.acpi_dev_state)?; - - // Inject the notification to VMGenID that we have resumed from a snapshot. - // This needs to happen before we resume vCPUs, so that we minimize the time between vCPUs - // resuming and notification being handled by the driver. - vmm.acpi_device_manager - .notify_vmgenid() - .map_err(BuildMicrovmFromSnapshotError::VMGenIDUpdate)?; - } + vmm.device_manager + .restore(µvm_state.device_states, device_ctor_args)?; // Move vcpus to their own threads and start their state machine in the 'Paused' state. vmm.start_vcpus( @@ -553,18 +490,6 @@ pub fn build_microvm_from_snapshot( Ok(vmm) } -/// Sets up the serial device. 
-pub fn setup_serial_device( - event_manager: &mut EventManager, -) -> Result>, VmmError> { - let serial = Arc::new(Mutex::new( - SerialDevice::new(Some(std::io::stdin()), SerialOut::Stdout(std::io::stdout())) - .map_err(VmmError::EventFd)?, - )); - event_manager.add_subscriber(serial.clone()); - Ok(serial) -} - /// 64 bytes due to alignment requirement in 3.1 of https://www.kernel.org/doc/html/v5.8/virt/kvm/devices/vcpu.html#attribute-kvm-arm-vcpu-pvtime-ipa #[cfg(target_arch = "aarch64")] const STEALTIME_STRUCT_MEM_SIZE: u64 = 64; @@ -578,6 +503,7 @@ fn allocate_pvtime_region( ) -> Result { let size = STEALTIME_STRUCT_MEM_SIZE * vcpu_count as u64; let addr = vmm + .device_manager .resource_allocator .allocate_system_memory(size, STEALTIME_STRUCT_MEM_SIZE, policy) .map_err(StartMicrovmError::AllocateResources)?; @@ -603,110 +529,22 @@ fn setup_pvtime(vmm: &mut Vmm, vcpus: &mut [Vcpu]) -> Result<(), StartMicrovmErr Ok(()) } -#[cfg(target_arch = "aarch64")] -fn attach_legacy_devices_aarch64( - event_manager: &mut EventManager, - vmm: &mut Vmm, - cmdline: &mut LoaderKernelCmdline, -) -> Result<(), VmmError> { - // Serial device setup. - let cmdline_contains_console = cmdline - .as_cstring() - .map_err(|_| VmmError::Cmdline)? - .into_string() - .map_err(|_| VmmError::Cmdline)? - .contains("console="); - - if cmdline_contains_console { - // Make stdout non-blocking. 
- set_stdout_nonblocking(); - let serial = Arc::new(Mutex::new( - SerialDevice::new(Some(std::io::stdin()), SerialOut::Stdout(std::io::stdout())) - .map_err(VmmError::EventFd)?, - )); - event_manager.add_subscriber(serial.clone()); - vmm.mmio_device_manager - .register_mmio_serial(vmm.vm.fd(), &mut vmm.resource_allocator, serial, None) - .map_err(VmmError::RegisterMMIODevice)?; - vmm.mmio_device_manager - .add_mmio_serial_to_cmdline(cmdline) - .map_err(VmmError::RegisterMMIODevice)?; - } - - let rtc = RTCDevice::new(); - vmm.mmio_device_manager - .register_mmio_rtc(&mut vmm.resource_allocator, rtc, None) - .map_err(VmmError::RegisterMMIODevice) -} - -/// Attaches a VirtioDevice device to the device manager and event manager. -fn attach_virtio_device( - event_manager: &mut EventManager, - vmm: &mut Vmm, - id: String, - device: Arc>, - cmdline: &mut LoaderKernelCmdline, - is_vhost_user: bool, -) -> Result<(), MmioError> { - event_manager.add_subscriber(device.clone()); - - let interrupt = Arc::new(IrqTrigger::new()); - // The device mutex mustn't be locked here otherwise it will deadlock. 
- let device = MmioTransport::new( - vmm.vm.guest_memory().clone(), - interrupt, - device, - is_vhost_user, - ); - vmm.mmio_device_manager - .register_mmio_virtio_for_boot( - vmm.vm.fd(), - &mut vmm.resource_allocator, - id, - device, - cmdline, - ) - .map(|_| ()) -} - -pub(crate) fn attach_boot_timer_device( - vmm: &mut Vmm, - request_ts: TimestampUs, -) -> Result<(), MmioError> { - let boot_timer = crate::devices::pseudo::BootTimer::new(request_ts); - - vmm.mmio_device_manager - .register_mmio_boot_timer(&mut vmm.resource_allocator, boot_timer)?; - - Ok(()) -} - -fn attach_vmgenid_device(vmm: &mut Vmm) -> Result<(), StartMicrovmError> { - let vmgenid = VmGenId::new(vmm.vm.guest_memory(), &mut vmm.resource_allocator) - .map_err(StartMicrovmError::CreateVMGenID)?; - - vmm.acpi_device_manager - .attach_vmgenid(vmgenid, vmm.vm.fd()) - .map_err(StartMicrovmError::AttachVmgenidDevice)?; - - Ok(()) -} - fn attach_entropy_device( vmm: &mut Vmm, cmdline: &mut LoaderKernelCmdline, entropy_device: &Arc>, event_manager: &mut EventManager, -) -> Result<(), MmioError> { +) -> Result<(), AttachMmioDeviceError> { let id = entropy_device .lock() .expect("Poisoned lock") .id() .to_string(); - attach_virtio_device( - event_manager, - vmm, + event_manager.add_subscriber(entropy_device.clone()); + vmm.device_manager.attach_virtio_device( + vmm.vm.guest_memory(), + vmm.vm.fd(), id, entropy_device.clone(), cmdline, @@ -736,9 +574,10 @@ fn attach_block_devices<'a, I: Iterator>> + Debug>( (locked.id().to_string(), locked.is_vhost_user()) }; // The device mutex mustn't be locked here otherwise it will deadlock. 
- attach_virtio_device( - event_manager, - vmm, + event_manager.add_subscriber(block.clone()); + vmm.device_manager.attach_virtio_device( + vmm.vm.guest_memory(), + vmm.vm.fd(), id, block.clone(), cmdline, @@ -756,8 +595,16 @@ fn attach_net_devices<'a, I: Iterator>> + Debug>( ) -> Result<(), StartMicrovmError> { for net_device in net_devices { let id = net_device.lock().expect("Poisoned lock").id().clone(); + event_manager.add_subscriber(net_device.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. - attach_virtio_device(event_manager, vmm, id, net_device.clone(), cmdline, false)?; + vmm.device_manager.attach_virtio_device( + vmm.vm.guest_memory(), + vmm.vm.fd(), + id, + net_device.clone(), + cmdline, + false, + )?; } Ok(()) } @@ -767,10 +614,18 @@ fn attach_unixsock_vsock_device( cmdline: &mut LoaderKernelCmdline, unix_vsock: &Arc>>, event_manager: &mut EventManager, -) -> Result<(), MmioError> { +) -> Result<(), AttachMmioDeviceError> { let id = String::from(unix_vsock.lock().expect("Poisoned lock").id()); + event_manager.add_subscriber(unix_vsock.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. - attach_virtio_device(event_manager, vmm, id, unix_vsock.clone(), cmdline, false) + vmm.device_manager.attach_virtio_device( + vmm.vm.guest_memory(), + vmm.vm.fd(), + id, + unix_vsock.clone(), + cmdline, + false, + ) } fn attach_balloon_device( @@ -778,38 +633,28 @@ fn attach_balloon_device( cmdline: &mut LoaderKernelCmdline, balloon: &Arc>, event_manager: &mut EventManager, -) -> Result<(), MmioError> { +) -> Result<(), AttachMmioDeviceError> { let id = String::from(balloon.lock().expect("Poisoned lock").id()); + event_manager.add_subscriber(balloon.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. - attach_virtio_device(event_manager, vmm, id, balloon.clone(), cmdline, false) -} - -// Adds `O_NONBLOCK` to the stdout flags. 
-pub(crate) fn set_stdout_nonblocking() { - // SAFETY: Call is safe since parameters are valid. - let flags = unsafe { libc::fcntl(libc::STDOUT_FILENO, libc::F_GETFL, 0) }; - if flags < 0 { - error!("Could not get Firecracker stdout flags."); - } - // SAFETY: Call is safe since parameters are valid. - let rc = unsafe { libc::fcntl(libc::STDOUT_FILENO, libc::F_SETFL, flags | libc::O_NONBLOCK) }; - if rc < 0 { - error!("Could not set Firecracker stdout to non-blocking."); - } + vmm.device_manager.attach_virtio_device( + vmm.vm.guest_memory(), + vmm.vm.fd(), + id, + balloon.clone(), + cmdline, + false, + ) } #[cfg(test)] pub(crate) mod tests { use linux_loader::cmdline::Cmdline; - #[cfg(target_arch = "x86_64")] - use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::tempfile::TempFile; use super::*; - use crate::device_manager::resources::ResourceAllocator; - #[cfg(target_arch = "x86_64")] - use crate::devices::legacy::serial::SerialOut; + use crate::device_manager::tests::default_device_manager; use crate::devices::virtio::block::CacheType; use crate::devices::virtio::rng::device::ENTROPY_DEV_ID; use crate::devices::virtio::vsock::{TYPE_VSOCK, VSOCK_DEV_ID}; @@ -882,20 +727,6 @@ pub(crate) mod tests { pub(crate) fn default_vmm() -> Vmm { let (kvm, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); - let mmio_device_manager = MMIODeviceManager::new(); - let acpi_device_manager = ACPIDeviceManager::new(); - #[cfg(target_arch = "x86_64")] - let pio_device_manager = PortIODeviceManager::new( - Arc::new(Mutex::new( - SerialDevice::new(None, SerialOut::Sink(std::io::sink())).unwrap(), - )), - Arc::new(Mutex::new(I8042Device::new( - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - ))), - ) - .unwrap(); - let (_, vcpus_exit_evt) = vm.create_vcpus(1).unwrap(); Vmm { @@ -907,11 +738,7 @@ pub(crate) mod tests { uffd: None, vcpus_handles: Vec::new(), vcpus_exit_evt, - resource_allocator: ResourceAllocator::new().unwrap(), - 
mmio_device_manager, - #[cfg(target_arch = "x86_64")] - pio_device_manager, - acpi_device_manager, + device_manager: default_device_manager(), } } @@ -1007,7 +834,8 @@ pub(crate) mod tests { attach_unixsock_vsock_device(vmm, cmdline, &vsock, event_manager).unwrap(); assert!( - vmm.mmio_device_manager + vmm.device_manager + .mmio_devices .get_virtio_device(TYPE_VSOCK, &vsock_dev_id) .is_some() ); @@ -1025,7 +853,8 @@ pub(crate) mod tests { attach_entropy_device(vmm, cmdline, &entropy, event_manager).unwrap(); assert!( - vmm.mmio_device_manager + vmm.device_manager + .mmio_devices .get_virtio_device(TYPE_RNG, ENTROPY_DEV_ID) .is_some() ); @@ -1033,8 +862,10 @@ pub(crate) mod tests { #[cfg(target_arch = "x86_64")] pub(crate) fn insert_vmgenid_device(vmm: &mut Vmm) { - attach_vmgenid_device(vmm).unwrap(); - assert!(vmm.acpi_device_manager.vmgenid.is_some()); + vmm.device_manager + .attach_vmgenid_device(vmm.vm.guest_memory(), vmm.vm.fd()) + .unwrap(); + assert!(vmm.device_manager.acpi_devices.vmgenid.is_some()); } pub(crate) fn insert_balloon_device( @@ -1050,7 +881,8 @@ pub(crate) mod tests { attach_balloon_device(vmm, cmdline, balloon, event_manager).unwrap(); assert!( - vmm.mmio_device_manager + vmm.device_manager + .mmio_devices .get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) .is_some() ); @@ -1101,7 +933,8 @@ pub(crate) mod tests { insert_block_devices(&mut vmm, &mut cmdline, &mut event_manager, block_configs); assert!(cmdline_contains(&cmdline, "root=/dev/vda ro")); assert!( - vmm.mmio_device_manager + vmm.device_manager + .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1122,7 +955,8 @@ pub(crate) mod tests { insert_block_devices(&mut vmm, &mut cmdline, &mut event_manager, block_configs); assert!(cmdline_contains(&cmdline, "root=PARTUUID=0eaa91a0-01 rw")); assert!( - vmm.mmio_device_manager + vmm.device_manager + .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1144,7 +978,8 @@ pub(crate) mod 
tests { assert!(!cmdline_contains(&cmdline, "root=PARTUUID=")); assert!(!cmdline_contains(&cmdline, "root=/dev/vda")); assert!( - vmm.mmio_device_manager + vmm.device_manager + .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1181,17 +1016,20 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=PARTUUID=0eaa91a0-01 rw")); assert!( - vmm.mmio_device_manager + vmm.device_manager + .mmio_devices .get_virtio_device(TYPE_BLOCK, "root") .is_some() ); assert!( - vmm.mmio_device_manager + vmm.device_manager + .mmio_devices .get_virtio_device(TYPE_BLOCK, "secondary") .is_some() ); assert!( - vmm.mmio_device_manager + vmm.device_manager + .mmio_devices .get_virtio_device(TYPE_BLOCK, "third") .is_some() ); @@ -1220,7 +1058,8 @@ pub(crate) mod tests { insert_block_devices(&mut vmm, &mut cmdline, &mut event_manager, block_configs); assert!(cmdline_contains(&cmdline, "root=/dev/vda rw")); assert!( - vmm.mmio_device_manager + vmm.device_manager + .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1241,7 +1080,8 @@ pub(crate) mod tests { insert_block_devices(&mut vmm, &mut cmdline, &mut event_manager, block_configs); assert!(cmdline_contains(&cmdline, "root=PARTUUID=0eaa91a0-01 ro")); assert!( - vmm.mmio_device_manager + vmm.device_manager + .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1262,7 +1102,8 @@ pub(crate) mod tests { insert_block_devices(&mut vmm, &mut cmdline, &mut event_manager, block_configs); assert!(cmdline_contains(&cmdline, "root=/dev/vda rw")); assert!( - vmm.mmio_device_manager + vmm.device_manager + .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1274,9 +1115,9 @@ pub(crate) mod tests { let mut vmm = default_vmm(); let request_ts = TimestampUs::default(); - let res = attach_boot_timer_device(&mut vmm, request_ts); + let res = vmm.device_manager.attach_boot_timer_device(request_ts); res.unwrap(); - 
assert!(vmm.mmio_device_manager.boot_timer.is_some()); + assert!(vmm.device_manager.mmio_devices.boot_timer.is_some()); } #[test] diff --git a/src/vmm/src/device_manager/legacy.rs b/src/vmm/src/device_manager/legacy.rs index 0af1ae3348a..a2866f14415 100644 --- a/src/vmm/src/device_manager/legacy.rs +++ b/src/vmm/src/device_manager/legacy.rs @@ -33,7 +33,6 @@ pub enum LegacyDeviceError { /// The `LegacyDeviceManger` should be initialized only by using the constructor. #[derive(Debug)] pub struct PortIODeviceManager { - pub io_bus: Arc, // BusDevice::Serial pub stdio_serial: Arc>, // BusDevice::I8042Device @@ -75,7 +74,6 @@ impl PortIODeviceManager { stdio_serial: Arc>, i8042: Arc>, ) -> Result { - let io_bus = Arc::new(vm_device::Bus::new()); let com_evt_1_3 = stdio_serial .lock() .expect("Poisoned lock") @@ -90,7 +88,6 @@ impl PortIODeviceManager { .try_clone()?; Ok(PortIODeviceManager { - io_bus, stdio_serial, i8042, com_evt_1_3, @@ -100,7 +97,11 @@ impl PortIODeviceManager { } /// Register supported legacy devices. 
- pub fn register_devices(&mut self, vm_fd: &VmFd) -> Result<(), LegacyDeviceError> { + pub fn register_devices( + &mut self, + io_bus: &vm_device::Bus, + vm_fd: &VmFd, + ) -> Result<(), LegacyDeviceError> { let serial_2_4 = Arc::new(Mutex::new(SerialDevice { serial: Serial::with_events( self.com_evt_2_4.try_clone()?.try_clone()?, @@ -121,27 +122,27 @@ impl PortIODeviceManager { ), input: None, })); - self.io_bus.insert( + io_bus.insert( self.stdio_serial.clone(), Self::SERIAL_PORT_ADDRESSES[0], Self::SERIAL_PORT_SIZE, )?; - self.io_bus.insert( + io_bus.insert( serial_2_4.clone(), Self::SERIAL_PORT_ADDRESSES[1], Self::SERIAL_PORT_SIZE, )?; - self.io_bus.insert( + io_bus.insert( serial_1_3, Self::SERIAL_PORT_ADDRESSES[2], Self::SERIAL_PORT_SIZE, )?; - self.io_bus.insert( + io_bus.insert( serial_2_4, Self::SERIAL_PORT_ADDRESSES[3], Self::SERIAL_PORT_SIZE, )?; - self.io_bus.insert( + io_bus.insert( self.i8042.clone(), Self::I8042_KDB_DATA_REGISTER_ADDRESS, Self::I8042_KDB_DATA_REGISTER_SIZE, @@ -245,6 +246,7 @@ mod tests { #[test] fn test_register_legacy_devices() { let (_, vm) = setup_vm_with_memory(0x1000); + let io_bus = vm_device::Bus::new(); vm.setup_irqchip().unwrap(); let mut ldm = PortIODeviceManager::new( Arc::new(Mutex::new(SerialDevice { @@ -263,6 +265,6 @@ mod tests { ))), ) .unwrap(); - ldm.register_devices(vm.fd()).unwrap(); + ldm.register_devices(&io_bus, vm.fd()).unwrap(); } } diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index add6e200954..d9ba26015b6 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -127,7 +127,6 @@ pub struct MMIODevice { /// Manages the complexities of registering a MMIO device. 
#[derive(Debug)] pub struct MMIODeviceManager { - pub(crate) bus: Arc, /// VirtIO devices using an MMIO transport layer pub(crate) virtio_devices: HashMap<(u32, String), MMIODevice>, /// Boot timer device @@ -153,7 +152,6 @@ impl MMIODeviceManager { /// Create a new DeviceManager handling mmio devices (virtio net, block). pub fn new() -> MMIODeviceManager { MMIODeviceManager { - bus: Arc::new(vm_device::Bus::new()), virtio_devices: HashMap::new(), boot_timer: None, #[cfg(target_arch = "aarch64")] @@ -194,21 +192,20 @@ impl MMIODeviceManager { &mut self, vm: &VmFd, device_id: String, - mmio_device: MmioTransport, - device_info: &MMIODeviceInfo, + mmio_bus: &vm_device::Bus, + device: MMIODevice, ) -> Result<(), MmioError> { // Our virtio devices are currently hardcoded to use a single IRQ. // Validate that requirement. - let Some(irq) = device_info.irq else { - return Err(MmioError::InvalidIrqConfig); - }; + let irq = device.resources.irq.ok_or(MmioError::InvalidIrqConfig)?; let identifier; { + let mmio_device = device.inner.lock().expect("Poisoned lock"); let locked_device = mmio_device.locked_device(); identifier = (locked_device.device_type(), device_id); for (i, queue_evt) in locked_device.queue_events().iter().enumerate() { let io_addr = IoEventAddress::Mmio( - device_info.addr + u64::from(crate::devices::virtio::NOTIFY_REG_OFFSET), + device.resources.addr + u64::from(crate::devices::virtio::NOTIFY_REG_OFFSET), ); vm.register_ioevent(queue_evt, &io_addr, u32::try_from(i).unwrap()) .map_err(MmioError::RegisterIoEvent)?; @@ -217,16 +214,12 @@ impl MMIODeviceManager { .map_err(MmioError::RegisterIrqFd)?; } - let device = Arc::new(Mutex::new(mmio_device)); - self.bus - .insert(device.clone(), device_info.addr, device_info.len)?; - self.virtio_devices.insert( - identifier, - MMIODevice { - resources: *device_info, - inner: device, - }, - ); + mmio_bus.insert( + device.inner.clone(), + device.resources.addr, + device.resources.len, + )?; + 
self.virtio_devices.insert(identifier, device); Ok(()) } @@ -259,24 +252,29 @@ impl MMIODeviceManager { vm: &VmFd, resource_allocator: &mut ResourceAllocator, device_id: String, + mmio_bus: &vm_device::Bus, mmio_device: MmioTransport, _cmdline: &mut kernel_cmdline::Cmdline, - ) -> Result { - let device_info = self.allocate_mmio_resources(resource_allocator, 1)?; - self.register_mmio_virtio(vm, device_id, mmio_device, &device_info)?; + ) -> Result<(), MmioError> { + let device = MMIODevice { + resources: self.allocate_mmio_resources(resource_allocator, 1)?, + inner: Arc::new(Mutex::new(mmio_device)), + }; + #[cfg(target_arch = "x86_64")] { - Self::add_virtio_device_to_cmdline(_cmdline, &device_info)?; + Self::add_virtio_device_to_cmdline(_cmdline, &device.resources)?; add_virtio_aml( &mut self.dsdt_data, - device_info.addr, - device_info.len, + device.resources.addr, + device.resources.len, // We are sure that `irqs` has at least one element; allocate_mmio_resources makes // sure of it. - device_info.irq.unwrap().get(), + device.resources.irq.unwrap().get(), )?; } - Ok(device_info) + self.register_mmio_virtio(vm, device_id, mmio_bus, device)?; + Ok(()) } #[cfg(target_arch = "aarch64")] @@ -285,6 +283,7 @@ impl MMIODeviceManager { pub fn register_mmio_serial( &mut self, vm: &VmFd, + mmio_bus: &vm_device::Bus, resource_allocator: &mut ResourceAllocator, serial: Arc>, device_info_opt: Option, @@ -303,31 +302,35 @@ impl MMIODeviceManager { ) .map_err(MmioError::RegisterIrqFd)?; - self.bus - .insert(serial.clone(), device_info.addr, device_info.len)?; - self.serial = Some(MMIODevice { + let device = MMIODevice { resources: device_info, inner: serial, - }); + }; + + mmio_bus.insert( + device.inner.clone(), + device.resources.addr, + device.resources.len, + )?; + + self.serial = Some(device); Ok(()) } #[cfg(target_arch = "aarch64")] /// Append the registered early console to the kernel cmdline. 
+ /// + /// This assumes that the device has been registered with the device manager. pub fn add_mmio_serial_to_cmdline( &self, cmdline: &mut kernel_cmdline::Cmdline, ) -> Result<(), MmioError> { - match &self.serial { - Some(device) => { - cmdline.insert( - "earlycon", - &format!("uart,mmio,0x{:08x}", device.resources.addr), - )?; - Ok(()) - } - None => Err(MmioError::DeviceNotFound), - } + let device = self.serial.as_ref().unwrap(); + cmdline.insert( + "earlycon", + &format!("uart,mmio,0x{:08x}", device.resources.addr), + )?; + Ok(()) } #[cfg(target_arch = "aarch64")] @@ -335,11 +338,11 @@ impl MMIODeviceManager { /// given as parameter, otherwise allocate a new MMIO resources for it. pub fn register_mmio_rtc( &mut self, + mmio_bus: &vm_device::Bus, resource_allocator: &mut ResourceAllocator, - rtc: RTCDevice, + rtc: Arc>, device_info_opt: Option, ) -> Result<(), MmioError> { - let device = Arc::new(Mutex::new(rtc)); // Create a new MMIODeviceInfo object on boot path or unwrap the // existing object on restore path. let device_info = if let Some(device_info) = device_info_opt { @@ -348,32 +351,41 @@ impl MMIODeviceManager { self.allocate_mmio_resources(resource_allocator, 1)? }; - self.bus - .insert(device.clone(), device_info.addr, device_info.len)?; - self.rtc = Some(MMIODevice { + let device = MMIODevice { resources: device_info, - inner: device, - }); + inner: rtc, + }; + mmio_bus.insert( + device.inner.clone(), + device.resources.addr, + device.resources.len, + )?; + self.rtc = Some(device); Ok(()) } /// Register a boot timer device. pub fn register_mmio_boot_timer( &mut self, + mmio_bus: &vm_device::Bus, resource_allocator: &mut ResourceAllocator, - boot_timer: BootTimer, + boot_timer: Arc>, ) -> Result<(), MmioError> { // Attach a new boot timer device. 
let device_info = self.allocate_mmio_resources(resource_allocator, 0)?; - - let device = Arc::new(Mutex::new(boot_timer)); - self.bus - .insert(device.clone(), device_info.addr, device_info.len)?; - self.boot_timer = Some(MMIODevice { + let device = MMIODevice { resources: device_info, - inner: device, - }); + inner: boot_timer, + }; + + mmio_bus.insert( + device.inner.clone(), + device.resources.addr, + device.resources.len, + )?; + self.boot_timer = Some(device); + Ok(()) } @@ -517,7 +529,7 @@ impl MMIODeviceManager { } #[cfg(test)] -mod tests { +pub(crate) mod tests { use std::ops::Deref; use std::sync::Arc; @@ -538,7 +550,7 @@ mod tests { const QUEUE_SIZES: &[u16] = &[64]; impl MMIODeviceManager { - fn register_virtio_test_device( + pub(crate) fn register_virtio_test_device( &mut self, vm: &VmFd, guest_mem: GuestMemoryMmap, @@ -548,15 +560,21 @@ mod tests { dev_id: &str, ) -> Result { let interrupt = Arc::new(IrqTrigger::new()); - let mmio_device = MmioTransport::new(guest_mem, interrupt, device, false); - let device_info = self.register_mmio_virtio_for_boot( + let mmio_bus = vm_device::Bus::new(); + let mmio_device = MmioTransport::new(guest_mem, interrupt, device.clone(), false); + self.register_mmio_virtio_for_boot( vm, resource_allocator, dev_id.to_string(), + &mmio_bus, mmio_device, cmdline, )?; - Ok(device_info.addr) + Ok(self + .get_virtio_device(device.lock().unwrap().device_type(), dev_id) + .unwrap() + .resources + .addr) } #[cfg(target_arch = "x86_64")] @@ -571,7 +589,7 @@ mod tests { #[allow(dead_code)] #[derive(Debug)] - struct DummyDevice { + pub(crate) struct DummyDevice { dummy: u32, queues: Vec, queue_evts: [EventFd; 1], diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index bc16604b645..8aec41ffa11 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -5,6 +5,38 @@ // Use of this source code is governed by a BSD-style license that can be // found in the THIRD-PARTY file. 
+use std::fmt::Debug; +use std::sync::{Arc, Mutex}; + +use acpi::ACPIDeviceManager; +use event_manager::{MutEventSubscriber, SubscriberOps}; +use kvm_ioctls::VmFd; +#[cfg(target_arch = "x86_64")] +use legacy::{LegacyDeviceError, PortIODeviceManager}; +use linux_loader::loader::Cmdline; +use log::error; +use mmio::{MMIODeviceManager, MmioError}; +use persist::{ACPIDeviceManagerConstructorArgs, MMIODevManagerConstructorArgs}; +use resources::ResourceAllocator; +use serde::{Deserialize, Serialize}; +use utils::time::TimestampUs; +use vmm_sys_util::eventfd::EventFd; + +use crate::devices::acpi::vmgenid::{VmGenId, VmGenIdError}; +#[cfg(target_arch = "x86_64")] +use crate::devices::legacy::I8042Device; +#[cfg(target_arch = "aarch64")] +use crate::devices::legacy::RTCDevice; +use crate::devices::legacy::serial::SerialOut; +use crate::devices::legacy::{IER_RDA_BIT, IER_RDA_OFFSET, SerialDevice}; +use crate::devices::pseudo::BootTimer; +use crate::devices::virtio::device::VirtioDevice; +use crate::devices::virtio::transport::mmio::{IrqTrigger, MmioTransport}; +use crate::resources::VmResources; +use crate::snapshot::Persist; +use crate::vstate::memory::GuestMemoryMmap; +use crate::{EmulateSerialInitError, EventManager}; + /// ACPI device manager. pub mod acpi; /// Legacy Device Manager. @@ -15,3 +47,422 @@ pub mod mmio; pub mod persist; /// Resource manager for devices. 
pub mod resources; + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +/// Error while creating a new [`DeviceManager`] +pub enum DeviceManagerCreateError { + /// Error with EventFd: {0} + EventFd(#[from] std::io::Error), + #[cfg(target_arch = "x86_64")] + /// Legacy device manager error: {0} + PortIOError(#[from] LegacyDeviceError), + /// Resource allocator error: {0} + ResourceAllocator(#[from] vm_allocator::Error), +} + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +/// Error while attaching a VirtIO device +pub enum AttachMmioDeviceError { + /// MMIO transport error: {0} + MmioTransport(#[from] MmioError), + /// Error inserting device in bus: {0} + Bus(#[from] vm_device::BusError), +} + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +/// Error while attaching the VMGenID device +pub enum AttachVmgenidError { + /// Error creating VMGenID device: {0} + CreateVmGenID(#[from] VmGenIdError), + /// Error while registering VMGenID with KVM: {0} + AttachVmGenID(#[from] kvm_ioctls::Error), +} + +#[cfg(target_arch = "aarch64")] +#[derive(Debug, thiserror::Error, displaydoc::Display)] +/// Error while attaching the VMGenID device +pub enum AttachLegacyMmioDeviceError { + /// Cmdline error + Cmdline, + /// Error creating serial device: {0} + CreateSerial(#[from] std::io::Error), + /// Error registering device: {0} + RegisterMMIODevice(#[from] MmioError), + /// Error inserting device in the Bus: {0} + Bus(#[from] vm_device::BusError), +} + +#[derive(Debug)] +/// A manager of all peripheral devices of Firecracker +pub struct DeviceManager { + /// Allocator for system memory and interrupt numbers + pub resource_allocator: ResourceAllocator, + /// MMIO bus + pub mmio_bus: Arc, + /// MMIO devices + pub mmio_devices: MMIODeviceManager, + #[cfg(target_arch = "x86_64")] + /// Port IO bus + pub pio_bus: Arc, + #[cfg(target_arch = "x86_64")] + /// Legacy devices + pub legacy_devices: PortIODeviceManager, + /// ACPI devices + pub acpi_devices: 
ACPIDeviceManager, +} + +impl DeviceManager { + // Adds `O_NONBLOCK` to the stdout flags. + fn set_stdout_nonblocking() { + // SAFETY: Call is safe since parameters are valid. + let flags = unsafe { libc::fcntl(libc::STDOUT_FILENO, libc::F_GETFL, 0) }; + if flags < 0 { + error!("Could not get Firecracker stdout flags."); + } + // SAFETY: Call is safe since parameters are valid. + let rc = + unsafe { libc::fcntl(libc::STDOUT_FILENO, libc::F_SETFL, flags | libc::O_NONBLOCK) }; + if rc < 0 { + error!("Could not set Firecracker stdout to non-blocking."); + } + } + + /// Sets up the serial device. + fn setup_serial_device( + event_manager: &mut EventManager, + ) -> Result>, std::io::Error> { + let serial = Arc::new(Mutex::new(SerialDevice::new( + Some(std::io::stdin()), + SerialOut::Stdout(std::io::stdout()), + )?)); + event_manager.add_subscriber(serial.clone()); + Ok(serial) + } + + #[cfg_attr(target_arch = "aarch64", allow(unused))] + pub fn new( + event_manager: &mut EventManager, + vcpu_exit_evt: &EventFd, + vmfd: &VmFd, + ) -> Result { + let mmio_bus = Arc::new(vm_device::Bus::new()); + + #[cfg(target_arch = "x86_64")] + let pio_bus = Arc::new(vm_device::Bus::new()); + #[cfg(target_arch = "x86_64")] + let legacy_devices = { + Self::set_stdout_nonblocking(); + + // Create serial device + let serial = Self::setup_serial_device(event_manager)?; + let reset_evt = vcpu_exit_evt + .try_clone() + .map_err(DeviceManagerCreateError::EventFd)?; + // Create keyboard emulator for reset event + let i8042 = Arc::new(Mutex::new(I8042Device::new( + reset_evt, + EventFd::new(libc::EFD_NONBLOCK).map_err(DeviceManagerCreateError::EventFd)?, + ))); + + // create pio dev manager with legacy devices + let mut legacy_devices = PortIODeviceManager::new(serial, i8042)?; + legacy_devices.register_devices(&pio_bus, vmfd)?; + legacy_devices + }; + + Ok(DeviceManager { + resource_allocator: ResourceAllocator::new()?, + mmio_bus, + mmio_devices: MMIODeviceManager::new(), + #[cfg(target_arch = 
"x86_64")] + pio_bus, + #[cfg(target_arch = "x86_64")] + legacy_devices, + acpi_devices: ACPIDeviceManager::new(), + }) + } + + /// Attaches a VirtioDevice device to the device manager and event manager. + pub(crate) fn attach_virtio_device( + &mut self, + mem: &GuestMemoryMmap, + vmfd: &VmFd, + id: String, + device: Arc>, + cmdline: &mut Cmdline, + is_vhost_user: bool, + ) -> Result<(), AttachMmioDeviceError> { + let interrupt = Arc::new(IrqTrigger::new()); + // The device mutex mustn't be locked here otherwise it will deadlock. + let device = MmioTransport::new(mem.clone(), interrupt, device, is_vhost_user); + self.mmio_devices.register_mmio_virtio_for_boot( + vmfd, + &mut self.resource_allocator, + id, + &self.mmio_bus, + device, + cmdline, + )?; + + Ok(()) + } + + /// Attaches a [`BootTimer`] to the VM + pub(crate) fn attach_boot_timer_device( + &mut self, + request_ts: TimestampUs, + ) -> Result<(), AttachMmioDeviceError> { + let boot_timer = Arc::new(Mutex::new(BootTimer::new(request_ts))); + + self.mmio_devices.register_mmio_boot_timer( + &self.mmio_bus, + &mut self.resource_allocator, + boot_timer, + )?; + + Ok(()) + } + + pub(crate) fn attach_vmgenid_device( + &mut self, + mem: &GuestMemoryMmap, + vmfd: &VmFd, + ) -> Result<(), AttachVmgenidError> { + let vmgenid = VmGenId::new(mem, &mut self.resource_allocator)?; + self.acpi_devices.attach_vmgenid(vmgenid, vmfd)?; + Ok(()) + } + + #[cfg(target_arch = "aarch64")] + pub(crate) fn attach_legacy_devices_aarch64( + &mut self, + vmfd: &VmFd, + event_manager: &mut EventManager, + cmdline: &mut Cmdline, + ) -> Result<(), AttachLegacyMmioDeviceError> { + // Serial device setup. + let cmdline_contains_console = cmdline + .as_cstring() + .map_err(|_| AttachLegacyMmioDeviceError::Cmdline)? + .into_string() + .map_err(|_| AttachLegacyMmioDeviceError::Cmdline)? + .contains("console="); + + if cmdline_contains_console { + // Make stdout non-blocking. 
+ Self::set_stdout_nonblocking(); + let serial = Self::setup_serial_device(event_manager)?; + self.mmio_devices.register_mmio_serial( + vmfd, + &self.mmio_bus, + &mut self.resource_allocator, + serial, + None, + )?; + self.mmio_devices.add_mmio_serial_to_cmdline(cmdline)?; + } + + let rtc = Arc::new(Mutex::new(RTCDevice::new())); + self.mmio_devices.register_mmio_rtc( + &self.mmio_bus, + &mut self.resource_allocator, + rtc, + None, + )?; + Ok(()) + } +} + +#[derive(Debug, Default, Clone, Serialize, Deserialize)] +/// State of devices in the system +pub struct DevicesState { + /// MMIO devices state + pub mmio_state: persist::DeviceStates, + /// ACPI devices state + pub acpi_state: persist::ACPIDeviceManagerState, +} + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum DevicePersistError { + /// Error restoring MMIO devices: {0} + MmioRestore(#[from] persist::DevicePersistError), + /// Error restoring ACPI devices: {0} + AcpiRestore(#[from] persist::ACPIDeviceManagerRestoreError), + /// Error notifying VMGenID device: {0} + VmGenidUpdate(#[from] std::io::Error), + /// Error resetting serial console: {0} + SerialRestore(#[from] EmulateSerialInitError), + /// Error inserting device in bus: {0} + Bus(#[from] vm_device::BusError), +} + +pub struct DeviceRestoreArgs<'a> { + pub mem: &'a GuestMemoryMmap, + pub vm: &'a VmFd, + pub event_manager: &'a mut EventManager, + pub vm_resources: &'a mut VmResources, + pub instance_id: &'a str, + pub restored_from_file: bool, +} + +impl DeviceManager { + pub fn save(&self) -> DevicesState { + DevicesState { + mmio_state: self.mmio_devices.save(), + acpi_state: self.acpi_devices.save(), + } + } + + /// Sets RDA bit in serial console + pub fn emulate_serial_init(&self) -> Result<(), EmulateSerialInitError> { + // When restoring from a previously saved state, there is no serial + // driver initialization, therefore the RDA (Received Data Available) + // interrupt is not enabled. 
Because of that, the driver won't get + // notified of any bytes that we send to the guest. The clean solution + // would be to save the whole serial device state when we do the vm + // serialization. For now we set that bit manually + + #[cfg(target_arch = "aarch64")] + { + if let Some(device) = &self.mmio_devices.serial { + let mut device_locked = device.inner.lock().expect("Poisoned lock"); + + device_locked + .serial + .write(IER_RDA_OFFSET, IER_RDA_BIT) + .map_err(|_| EmulateSerialInitError(std::io::Error::last_os_error()))?; + } + Ok(()) + } + + #[cfg(target_arch = "x86_64")] + { + let mut serial = self + .legacy_devices + .stdio_serial + .lock() + .expect("Poisoned lock"); + + serial + .serial + .write(IER_RDA_OFFSET, IER_RDA_BIT) + .map_err(|_| EmulateSerialInitError(std::io::Error::last_os_error()))?; + Ok(()) + } + } + + pub fn restore( + &mut self, + state: &DevicesState, + restore_args: DeviceRestoreArgs, + ) -> Result<(), DevicePersistError> { + // Restore MMIO devices + let mmio_ctor_args = MMIODevManagerConstructorArgs { + mmio_bus: &self.mmio_bus, + mem: restore_args.mem, + vm: restore_args.vm, + event_manager: restore_args.event_manager, + resource_allocator: &mut self.resource_allocator, + vm_resources: restore_args.vm_resources, + instance_id: restore_args.instance_id, + restored_from_file: restore_args.restored_from_file, + }; + self.mmio_devices = MMIODeviceManager::restore(mmio_ctor_args, &state.mmio_state)?; + + // Restore serial. 
+ // We need to do that after we restore mmio devices, otherwise it won't succeed in Aarch64 + self.emulate_serial_init()?; + + // Restore ACPI devices + let acpi_ctor_args = ACPIDeviceManagerConstructorArgs { + mem: restore_args.mem, + resource_allocator: &mut self.resource_allocator, + vm: restore_args.vm, + }; + self.acpi_devices = ACPIDeviceManager::restore(acpi_ctor_args, &state.acpi_state)?; + self.acpi_devices.notify_vmgenid()?; + + Ok(()) + } +} + +#[cfg(test)] +pub(crate) mod tests { + use super::*; + #[cfg(target_arch = "aarch64")] + use crate::builder::tests::default_vmm; + + pub(crate) fn default_device_manager() -> DeviceManager { + let mmio_bus = Arc::new(vm_device::Bus::new()); + #[cfg(target_arch = "x86_64")] + let pio_bus = Arc::new(vm_device::Bus::new()); + let mmio_devices = MMIODeviceManager::new(); + let acpi_devices = ACPIDeviceManager::new(); + let resource_allocator = ResourceAllocator::new().unwrap(); + + #[cfg(target_arch = "x86_64")] + let legacy_devices = PortIODeviceManager::new( + Arc::new(Mutex::new( + SerialDevice::new(None, SerialOut::Sink(std::io::sink())).unwrap(), + )), + Arc::new(Mutex::new(I8042Device::new( + EventFd::new(libc::EFD_NONBLOCK).unwrap(), + EventFd::new(libc::EFD_NONBLOCK).unwrap(), + ))), + ) + .unwrap(); + + DeviceManager { + resource_allocator, + mmio_bus, + mmio_devices, + #[cfg(target_arch = "x86_64")] + pio_bus, + #[cfg(target_arch = "x86_64")] + legacy_devices, + acpi_devices, + } + } + + #[cfg(target_arch = "aarch64")] + #[test] + fn test_attach_legacy_serial() { + let mut vmm = default_vmm(); + assert!(vmm.device_manager.mmio_devices.rtc.is_none()); + assert!(vmm.device_manager.mmio_devices.serial.is_none()); + + let mut cmdline = Cmdline::new(4096).unwrap(); + let mut event_manager = EventManager::new().unwrap(); + vmm.device_manager + .attach_legacy_devices_aarch64(vmm.vm.fd(), &mut event_manager, &mut cmdline) + .unwrap(); + assert!(vmm.device_manager.mmio_devices.rtc.is_some()); + 
assert!(vmm.device_manager.mmio_devices.serial.is_none()); + + let mut vmm = default_vmm(); + cmdline.insert("console", "/dev/blah").unwrap(); + vmm.device_manager + .attach_legacy_devices_aarch64(vmm.vm.fd(), &mut event_manager, &mut cmdline) + .unwrap(); + assert!(vmm.device_manager.mmio_devices.rtc.is_some()); + assert!(vmm.device_manager.mmio_devices.serial.is_some()); + + assert!( + cmdline + .as_cstring() + .unwrap() + .into_string() + .unwrap() + .contains(&format!( + "earlycon=uart,mmio,0x{:08x}", + vmm.device_manager + .mmio_devices + .serial + .as_ref() + .unwrap() + .resources + .addr + )) + ); + } +} diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index a983140ed21..99216ec77e7 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -63,6 +63,8 @@ pub enum DevicePersistError { DeviceManager(#[from] super::mmio::MmioError), /// Mmio transport MmioTransport, + /// Bus error: {0} + Bus(#[from] vm_device::BusError), #[cfg(target_arch = "aarch64")] /// Legacy: {0} Legacy(#[from] std::io::Error), @@ -212,6 +214,7 @@ pub enum SharedDeviceType { } pub struct MMIODevManagerConstructorArgs<'a> { + pub mmio_bus: &'a vm_device::Bus, pub mem: &'a GuestMemoryMmap, pub vm: &'a VmFd, pub event_manager: &'a mut EventManager, @@ -443,13 +446,14 @@ impl<'a> Persist<'a> for MMIODeviceManager { dev_manager.register_mmio_serial( vm, + constructor_args.mmio_bus, constructor_args.resource_allocator, serial, Some(state.device_info), )?; } if state.type_ == DeviceType::Rtc { - let rtc = RTCDevice::new(); + let rtc = Arc::new(Mutex::new(RTCDevice::new())); constructor_args .resource_allocator .allocate_mmio_memory( @@ -461,6 +465,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { DevicePersistError::DeviceManager(super::mmio::MmioError::Allocator(e)) })?; dev_manager.register_mmio_rtc( + constructor_args.mmio_bus, constructor_args.resource_allocator, rtc, Some(state.device_info), @@ -476,6 +481,7 
@@ impl<'a> Persist<'a> for MMIODeviceManager { state: &MmioTransportState, interrupt: Arc, device_info: &MMIODeviceInfo, + mmio_bus: &vm_device::Bus, event_manager: &mut EventManager| -> Result<(), Self::Error> { let restore_args = MmioTransportConstructorArgs { @@ -484,8 +490,10 @@ impl<'a> Persist<'a> for MMIODeviceManager { device, is_vhost_user, }; - let mmio_transport = MmioTransport::restore(restore_args, state) - .map_err(|()| DevicePersistError::MmioTransport)?; + let mmio_transport = Arc::new(Mutex::new( + MmioTransport::restore(restore_args, state) + .map_err(|()| DevicePersistError::MmioTransport)?, + )); // We do not currently require exact re-allocation of IDs via // `dev_manager.irq_allocator.allocate_id()` and currently cannot do @@ -508,7 +516,15 @@ impl<'a> Persist<'a> for MMIODeviceManager { DevicePersistError::DeviceManager(super::mmio::MmioError::Allocator(e)) })?; - dev_manager.register_mmio_virtio(vm, id.clone(), mmio_transport, device_info)?; + dev_manager.register_mmio_virtio( + vm, + id.clone(), + mmio_bus, + MMIODevice { + resources: *device_info, + inner: mmio_transport, + }, + )?; event_manager.add_subscriber(as_subscriber); Ok(()) @@ -537,6 +553,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { &balloon_state.transport_state, interrupt, &balloon_state.device_info, + constructor_args.mmio_bus, constructor_args.event_manager, )?; } @@ -563,6 +580,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { &block_state.transport_state, interrupt, &block_state.device_info, + constructor_args.mmio_bus, constructor_args.event_manager, )?; } @@ -611,6 +629,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { &net_state.transport_state, interrupt, &net_state.device_info, + constructor_args.mmio_bus, constructor_args.event_manager, )?; } @@ -642,6 +661,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { &vsock_state.transport_state, interrupt, &vsock_state.device_info, + constructor_args.mmio_bus, constructor_args.event_manager, )?; } @@ -667,6 +687,7 @@ 
impl<'a> Persist<'a> for MMIODeviceManager { &entropy_state.transport_state, interrupt, &entropy_state.device_info, + constructor_args.mmio_bus, constructor_args.event_manager, )?; } @@ -726,18 +747,6 @@ mod tests { } } - impl MMIODeviceManager { - fn soft_clone(&self) -> Self { - // We can unwrap here as we create with values directly in scope we - // know will results in `Ok` - let mut clone = MMIODeviceManager::new(); - // We only care about the device hashmap. - clone.virtio_devices.clone_from(&self.virtio_devices); - clone.boot_timer = self.boot_timer.clone(); - clone - } - } - impl PartialEq for MMIODevice { fn eq(&self, other: &Self) -> bool { self.resources == other.resources @@ -770,7 +779,7 @@ mod tests { let mut resource_allocator = ResourceAllocator::new().unwrap(); tmp_sock_file.remove().unwrap(); // Set up a vmm with one of each device, and get the serialized DeviceStates. - let original_mmio_device_manager = { + { let mut event_manager = EventManager::new().expect("Unable to create EventManager"); let mut vmm = default_vmm(); let mut cmdline = default_kernel_cmdline(); @@ -820,11 +829,9 @@ mod tests { let entropy_config = EntropyDeviceConfig::default(); insert_entropy_device(&mut vmm, &mut cmdline, &mut event_manager, entropy_config); - Snapshot::serialize(&mut buf.as_mut_slice(), &vmm.mmio_device_manager.save()).unwrap(); + Snapshot::serialize(&mut buf.as_mut_slice(), &vmm.device_manager.save()).unwrap(); + } - // We only want to keep the device map from the original MmioDeviceManager. 
- vmm.mmio_device_manager.soft_clone() - }; tmp_sock_file.remove().unwrap(); let mut event_manager = EventManager::new().expect("Unable to create EventManager"); @@ -832,6 +839,7 @@ mod tests { let device_states: DeviceStates = Snapshot::deserialize(&mut buf.as_slice()).unwrap(); let vm_resources = &mut VmResources::default(); let restore_args = MMIODevManagerConstructorArgs { + mmio_bus: &vmm.device_manager.mmio_bus, mem: vmm.vm.guest_memory(), vm: vmm.vm.fd(), event_manager: &mut event_manager, @@ -840,7 +848,7 @@ mod tests { instance_id: "microvm-id", restored_from_file: true, }; - let restored_dev_manager = + let _restored_dev_manager = MMIODeviceManager::restore(restore_args, &device_states).unwrap(); let expected_vm_resources = format!( @@ -917,8 +925,6 @@ mod tests { MmdsVersion::V2 ); assert_eq!(device_states.mmds_version.unwrap(), MmdsVersion::V2.into()); - - assert_eq!(restored_dev_manager, original_mmio_device_manager); assert_eq!( expected_vm_resources, serde_json::to_string_pretty(&VmmConfig::from(&*vm_resources)).unwrap() diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 10411c24487..30104890e7d 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -121,8 +121,7 @@ use std::sync::mpsc::RecvTimeoutError; use std::sync::{Arc, Barrier, Mutex}; use std::time::Duration; -use device_manager::acpi::ACPIDeviceManager; -use device_manager::resources::ResourceAllocator; +use device_manager::DeviceManager; use devices::acpi::vmgenid::VmGenIdError; use devices::virtio::device::VirtioDevice; use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEventSubscriber}; @@ -135,10 +134,6 @@ use vstate::kvm::Kvm; use vstate::vcpu::{self, StartThreadedError, VcpuSendEventError}; use crate::cpu_config::templates::CpuConfiguration; -#[cfg(target_arch = "x86_64")] -use crate::device_manager::legacy::PortIODeviceManager; -use crate::device_manager::mmio::MMIODeviceManager; -use crate::devices::legacy::{IER_RDA_BIT, IER_RDA_OFFSET}; use 
crate::devices::virtio::balloon::{ BALLOON_DEV_ID, Balloon, BalloonConfig, BalloonError, BalloonStats, }; @@ -148,7 +143,6 @@ use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET}; use crate::logger::{METRICS, MetricsError, error, info, warn}; use crate::persist::{MicrovmState, MicrovmStateError, VmInfo}; use crate::rate_limiter::BucketUpdate; -use crate::snapshot::Persist; use crate::vmm_config::instance_info::{InstanceInfo, VmState}; use crate::vstate::memory::{GuestMemory, GuestMemoryMmap, GuestMemoryRegion}; use crate::vstate::vcpu::VcpuState; @@ -205,17 +199,15 @@ pub const HTTP_MAX_PAYLOAD_SIZE: usize = 51200; /// have permissions to open the KVM fd). #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum VmmError { - /// Failed to allocate guest resource: {0} - AllocateResources(#[from] vm_allocator::Error), #[cfg(target_arch = "aarch64")] /// Invalid command line error. Cmdline, /// Device manager error: {0} - DeviceManager(device_manager::mmio::MmioError), + DeviceManager(#[from] device_manager::DeviceManagerCreateError), + /// MMIO Device manager error: {0} + MmioDeviceManager(device_manager::mmio::MmioError), /// Error getting the KVM dirty bitmap. {0} DirtyBitmap(kvm_ioctls::Error), - /// Event fd error: {0} - EventFd(io::Error), /// I8042 error: {0} I8042Error(devices::legacy::I8042DeviceError), #[cfg(target_arch = "x86_64")] @@ -313,14 +305,8 @@ pub struct Vmm { vcpus_handles: Vec, // Used by Vcpus and devices to initiate teardown; Vmm should never write here. vcpus_exit_evt: EventFd, - - // Allocator for guest resources - resource_allocator: ResourceAllocator, - // Guest VM devices. 
- mmio_device_manager: MMIODeviceManager, - #[cfg(target_arch = "x86_64")] - pio_device_manager: PortIODeviceManager, - acpi_device_manager: ACPIDeviceManager, + // Device manager + device_manager: DeviceManager, } impl Vmm { @@ -346,7 +332,8 @@ impl Vmm { device_id: &str, ) -> Option>> { let device = self - .mmio_device_manager + .device_manager + .mmio_devices .get_virtio_device(device_type, device_id)?; Some(device.inner.lock().expect("Poisoned lock").device().clone()) @@ -384,10 +371,10 @@ impl Vmm { self.vcpus_handles.reserve(vcpu_count); for mut vcpu in vcpus.drain(..) { - vcpu.set_mmio_bus(self.mmio_device_manager.bus.clone()); + vcpu.set_mmio_bus(self.device_manager.mmio_bus.clone()); #[cfg(target_arch = "x86_64")] vcpu.kvm_vcpu - .set_pio_bus(self.pio_device_manager.io_bus.clone()); + .set_pio_bus(self.device_manager.pio_bus.clone()); self.vcpus_handles .push(vcpu.start_threaded(vcpu_seccomp_filter.clone(), barrier.clone())?); @@ -401,7 +388,7 @@ impl Vmm { /// Sends a resume command to the vCPUs. pub fn resume_vm(&mut self) -> Result<(), VmmError> { - self.mmio_device_manager.kick_devices(); + self.device_manager.mmio_devices.kick_devices(); // Send the events. self.vcpus_handles @@ -445,48 +432,11 @@ impl Vmm { Ok(()) } - /// Sets RDA bit in serial console - pub fn emulate_serial_init(&self) -> Result<(), EmulateSerialInitError> { - // When restoring from a previously saved state, there is no serial - // driver initialization, therefore the RDA (Received Data Available) - // interrupt is not enabled. Because of that, the driver won't get - // notified of any bytes that we send to the guest. The clean solution - // would be to save the whole serial device state when we do the vm - // serialization. 
For now we set that bit manually - - #[cfg(target_arch = "aarch64")] - { - if let Some(device) = &self.mmio_device_manager.serial { - let mut device_locked = device.inner.lock().expect("Poisoned lock"); - - device_locked - .serial - .write(IER_RDA_OFFSET, IER_RDA_BIT) - .map_err(|_| EmulateSerialInitError(std::io::Error::last_os_error()))?; - } - Ok(()) - } - - #[cfg(target_arch = "x86_64")] - { - let mut serial = self - .pio_device_manager - .stdio_serial - .lock() - .expect("Poisoned lock"); - - serial - .serial - .write(IER_RDA_OFFSET, IER_RDA_BIT) - .map_err(|_| EmulateSerialInitError(std::io::Error::last_os_error()))?; - Ok(()) - } - } - /// Injects CTRL+ALT+DEL keystroke combo in the i8042 device. #[cfg(target_arch = "x86_64")] pub fn send_ctrl_alt_del(&mut self) -> Result<(), VmmError> { - self.pio_device_manager + self.device_manager + .legacy_devices .i8042 .lock() .expect("i8042 lock was poisoned") @@ -511,9 +461,7 @@ impl Vmm { self.vm.save_state(&mpidrs).map_err(SaveVmState)? } }; - let device_states = self.mmio_device_manager.save(); - - let acpi_dev_state = self.acpi_device_manager.save(); + let device_states = self.device_manager.save(); Ok(MicrovmState { vm_info: vm_info.clone(), @@ -521,7 +469,6 @@ impl Vmm { vm_state, vcpu_states, device_states, - acpi_dev_state, }) } @@ -588,13 +535,14 @@ impl Vmm { drive_id: &str, path_on_host: String, ) -> Result<(), VmmError> { - self.mmio_device_manager + self.device_manager + .mmio_devices .with_virtio_device_with_id(TYPE_BLOCK, drive_id, |block: &mut Block| { block .update_disk_image(path_on_host) .map_err(|err| err.to_string()) }) - .map_err(VmmError::DeviceManager) + .map_err(VmmError::MmioDeviceManager) } /// Updates the rate limiter parameters for block device with `drive_id` id. 
@@ -604,22 +552,24 @@ impl Vmm { rl_bytes: BucketUpdate, rl_ops: BucketUpdate, ) -> Result<(), VmmError> { - self.mmio_device_manager + self.device_manager + .mmio_devices .with_virtio_device_with_id(TYPE_BLOCK, drive_id, |block: &mut Block| { block .update_rate_limiter(rl_bytes, rl_ops) .map_err(|err| err.to_string()) }) - .map_err(VmmError::DeviceManager) + .map_err(VmmError::MmioDeviceManager) } /// Updates the rate limiter parameters for block device with `drive_id` id. pub fn update_vhost_user_block_config(&mut self, drive_id: &str) -> Result<(), VmmError> { - self.mmio_device_manager + self.device_manager + .mmio_devices .with_virtio_device_with_id(TYPE_BLOCK, drive_id, |block: &mut Block| { block.update_config().map_err(|err| err.to_string()) }) - .map_err(VmmError::DeviceManager) + .map_err(VmmError::MmioDeviceManager) } /// Updates the rate limiter parameters for net device with `net_id` id. @@ -631,12 +581,13 @@ impl Vmm { tx_bytes: BucketUpdate, tx_ops: BucketUpdate, ) -> Result<(), VmmError> { - self.mmio_device_manager + self.device_manager + .mmio_devices .with_virtio_device_with_id(TYPE_NET, net_id, |net: &mut Net| { net.patch_rate_limiters(rx_bytes, rx_ops, tx_bytes, tx_ops); Ok(()) }) - .map_err(VmmError::DeviceManager) + .map_err(VmmError::MmioDeviceManager) } /// Returns a reference to the balloon device if present. 
diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index da5a603d820..b66228aeebe 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -25,7 +25,7 @@ use crate::cpu_config::templates::StaticCpuTemplate; use crate::cpu_config::x86_64::cpuid::CpuidTrait; #[cfg(target_arch = "x86_64")] use crate::cpu_config::x86_64::cpuid::common::get_vendor_id_from_host; -use crate::device_manager::persist::{ACPIDeviceManagerState, DevicePersistError, DeviceStates}; +use crate::device_manager::{DevicePersistError, DevicesState}; use crate::logger::{info, warn}; use crate::resources::VmResources; use crate::seccomp::BpfThreadMap; @@ -81,9 +81,7 @@ pub struct MicrovmState { /// Vcpu states. pub vcpu_states: Vec, /// Device states. - pub device_states: DeviceStates, - /// ACPI devices state. - pub acpi_dev_state: ACPIDeviceManagerState, + pub device_states: DevicesState, } /// This describes the mapping between Firecracker base virtual address and @@ -118,7 +116,7 @@ pub enum MicrovmStateError { /// Operation not allowed: {0} NotAllowed(String), /// Cannot restore devices: {0} - RestoreDevices(DevicePersistError), + RestoreDevices(#[from] DevicePersistError), /// Cannot save Vcpu state: {0} SaveVcpuState(vstate::vcpu::VcpuError), /// Cannot save Vm state: {0} @@ -171,7 +169,8 @@ pub fn create_snapshot( // SAFETY: // This should never fail as we only mark pages only if device has already been activated, // and the address validation was already performed on device activation. 
- vmm.mmio_device_manager + vmm.device_manager + .mmio_devices .for_each_virtio_device(|_, _, device| { let mmio_dev_locked = device.inner.lock().expect("Poisoned lock"); let d = mmio_dev_locked.locked_device(); @@ -335,7 +334,7 @@ pub fn restore_from_snapshot( ) -> Result>, RestoreFromSnapshotError> { let mut microvm_state = snapshot_state_from_file(¶ms.snapshot_path)?; for entry in ¶ms.network_overrides { - let net_devices = &mut microvm_state.device_states.net_devices; + let net_devices = &mut microvm_state.device_states.mmio_state.net_devices; if let Some(device) = net_devices .iter_mut() .find(|x| x.device_state.id == entry.iface_id) @@ -600,7 +599,6 @@ mod tests { #[cfg(target_arch = "aarch64")] use crate::construct_kvm_mpidrs; use crate::devices::virtio::block::CacheType; - use crate::snapshot::Persist; use crate::vmm_config::balloon::BalloonDeviceConfig; use crate::vmm_config::net::NetworkInterfaceConfig; use crate::vmm_config::vsock::tests::default_config; @@ -661,14 +659,14 @@ mod tests { #[test] fn test_microvm_state_snapshot() { let vmm = default_vmm_with_devices(); - let states = vmm.mmio_device_manager.save(); + let states = vmm.device_manager.save(); // Only checking that all devices are saved, actual device state // is tested by that device's tests. 
- assert_eq!(states.block_devices.len(), 1); - assert_eq!(states.net_devices.len(), 1); - assert!(states.vsock_device.is_some()); - assert!(states.balloon_device.is_some()); + assert_eq!(states.mmio_state.block_devices.len(), 1); + assert_eq!(states.mmio_state.net_devices.len(), 1); + assert!(states.mmio_state.vsock_device.is_some()); + assert!(states.mmio_state.balloon_device.is_some()); let vcpu_states = vec![VcpuState::default()]; #[cfg(target_arch = "aarch64")] @@ -685,7 +683,6 @@ mod tests { vm_state: vmm.vm.save_state(&mpidrs).unwrap(), #[cfg(target_arch = "x86_64")] vm_state: vmm.vm.save_state().unwrap(), - acpi_dev_state: vmm.acpi_device_manager.save(), }; let mut buf = vec![0; 10000]; @@ -696,8 +693,8 @@ mod tests { assert_eq!(restored_microvm_state.vm_info, microvm_state.vm_info); assert_eq!( - restored_microvm_state.device_states, - microvm_state.device_states + restored_microvm_state.device_states.mmio_state, + microvm_state.device_states.mmio_state ) } diff --git a/src/vmm/tests/integration_tests.rs b/src/vmm/tests/integration_tests.rs index e513da695c0..6982bf08c5b 100644 --- a/src/vmm/tests/integration_tests.rs +++ b/src/vmm/tests/integration_tests.rs @@ -235,9 +235,29 @@ fn verify_create_snapshot(is_diff: bool) -> (TempFile, TempFile) { // Verify deserialized data. // The default vmm has no devices and one vCPU. 
- assert_eq!(restored_microvm_state.device_states.block_devices.len(), 0); - assert_eq!(restored_microvm_state.device_states.net_devices.len(), 0); - assert!(restored_microvm_state.device_states.vsock_device.is_none()); + assert_eq!( + restored_microvm_state + .device_states + .mmio_state + .block_devices + .len(), + 0 + ); + assert_eq!( + restored_microvm_state + .device_states + .mmio_state + .net_devices + .len(), + 0 + ); + assert!( + restored_microvm_state + .device_states + .mmio_state + .vsock_device + .is_none() + ); assert_eq!(restored_microvm_state.vcpu_states.len(), 1); (snapshot_file, memory_file) diff --git a/tests/integration_tests/functional/test_api.py b/tests/integration_tests/functional/test_api.py index 24283228f9a..ab2546c861f 100644 --- a/tests/integration_tests/functional/test_api.py +++ b/tests/integration_tests/functional/test_api.py @@ -772,7 +772,7 @@ def test_send_ctrl_alt_del(uvm_plain_any): def _drive_patch(test_microvm, io_engine): """Exercise drive patch test scenarios.""" # Patches without mandatory fields for virtio block are not allowed. - expected_msg = "Unable to patch the block device: Device manager error: Running method expected different backend. Please verify the request arguments" + expected_msg = "Unable to patch the block device: MMIO Device manager error: Running method expected different backend. Please verify the request arguments" with pytest.raises(RuntimeError, match=expected_msg): test_microvm.api.drive.patch(drive_id="scratch") @@ -814,7 +814,7 @@ def _drive_patch(test_microvm, io_engine): ) # Updates to `path_on_host` with an invalid path are not allowed. 
- expected_msg = f"Unable to patch the block device: Device manager error: Virtio backend error: Error manipulating the backing file: No such file or directory (os error 2) {drive_path} Please verify the request arguments" + expected_msg = f"Unable to patch the block device: MMIO Device manager error: Virtio backend error: Error manipulating the backing file: No such file or directory (os error 2) {drive_path} Please verify the request arguments" with pytest.raises(RuntimeError, match=re.escape(expected_msg)): test_microvm.api.drive.patch(drive_id="scratch", path_on_host=drive_path) From 7266b4b89cfa4b02ade343a33e4956ce68043edc Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 7 May 2025 10:26:38 +0200 Subject: [PATCH 12/56] refactor: simplify creation of I8042 device We always create anew the keyboard interrupt event. Just create it inside `I8042Device::new()` and return an error if that fails. Signed-off-by: Babis Chalios --- src/vmm/src/device_manager/legacy.rs | 7 +++---- src/vmm/src/device_manager/mod.rs | 12 ++++-------- src/vmm/src/devices/legacy/i8042.rs | 28 ++++++++-------------------- 3 files changed, 15 insertions(+), 32 deletions(-) diff --git a/src/vmm/src/device_manager/legacy.rs b/src/vmm/src/device_manager/legacy.rs index a2866f14415..cedb7abc32c 100644 --- a/src/vmm/src/device_manager/legacy.rs +++ b/src/vmm/src/device_manager/legacy.rs @@ -259,10 +259,9 @@ mod tests { ), input: None, })), - Arc::new(Mutex::new(I8042Device::new( - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - ))), + Arc::new(Mutex::new( + I8042Device::new(EventFd::new(libc::EFD_NONBLOCK).unwrap()).unwrap(), + )), ) .unwrap(); ldm.register_devices(&io_bus, vm.fd()).unwrap(); diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 8aec41ffa11..3e3f0f0ffda 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -159,10 +159,7 @@ impl DeviceManager { .try_clone() 
.map_err(DeviceManagerCreateError::EventFd)?; // Create keyboard emulator for reset event - let i8042 = Arc::new(Mutex::new(I8042Device::new( - reset_evt, - EventFd::new(libc::EFD_NONBLOCK).map_err(DeviceManagerCreateError::EventFd)?, - ))); + let i8042 = Arc::new(Mutex::new(I8042Device::new(reset_evt)?)); // create pio dev manager with legacy devices let mut legacy_devices = PortIODeviceManager::new(serial, i8042)?; @@ -405,10 +402,9 @@ pub(crate) mod tests { Arc::new(Mutex::new( SerialDevice::new(None, SerialOut::Sink(std::io::sink())).unwrap(), )), - Arc::new(Mutex::new(I8042Device::new( - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - ))), + Arc::new(Mutex::new( + I8042Device::new(EventFd::new(libc::EFD_NONBLOCK).unwrap()).unwrap(), + )), ) .unwrap(); diff --git a/src/vmm/src/devices/legacy/i8042.rs b/src/vmm/src/devices/legacy/i8042.rs index 1bc830bd13b..235ce2a7339 100644 --- a/src/vmm/src/devices/legacy/i8042.rs +++ b/src/vmm/src/devices/legacy/i8042.rs @@ -119,10 +119,10 @@ pub struct I8042Device { impl I8042Device { /// Constructs an i8042 device that will signal the given event when the guest requests it. - pub fn new(reset_evt: EventFd, kbd_interrupt_evt: EventFd) -> I8042Device { - I8042Device { + pub fn new(reset_evt: EventFd) -> Result { + Ok(I8042Device { reset_evt, - kbd_interrupt_evt, + kbd_interrupt_evt: EventFd::new(libc::EFD_NONBLOCK)?, control: CB_POST_OK | CB_KBD_INT, cmd: 0, outp: 0, @@ -130,7 +130,7 @@ impl I8042Device { buf: [0; BUF_SIZE], bhead: Wrapping(0), btail: Wrapping(0), - } + }) } /// Signal a ctrl-alt-del (reset) event. 
@@ -355,10 +355,7 @@ mod tests { #[test] fn test_i8042_read_write_and_event() { - let mut i8042 = I8042Device::new( - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - ); + let mut i8042 = I8042Device::new(EventFd::new(libc::EFD_NONBLOCK).unwrap()).unwrap(); let reset_evt = i8042.reset_evt.try_clone().unwrap(); // Check if reading in a 2-length array doesn't have side effects. @@ -395,10 +392,7 @@ mod tests { #[test] fn test_i8042_commands() { - let mut i8042 = I8042Device::new( - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - ); + let mut i8042 = I8042Device::new(EventFd::new(libc::EFD_NONBLOCK).unwrap()).unwrap(); let mut data = [1]; // Test reading/writing the control register. @@ -435,10 +429,7 @@ mod tests { #[test] fn test_i8042_buffer() { - let mut i8042 = I8042Device::new( - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - ); + let mut i8042 = I8042Device::new(EventFd::new(libc::EFD_NONBLOCK).unwrap()).unwrap(); // Test push/pop. i8042.push_byte(52).unwrap(); @@ -462,10 +453,7 @@ mod tests { #[test] fn test_i8042_kbd() { - let mut i8042 = I8042Device::new( - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - EventFd::new(libc::EFD_NONBLOCK).unwrap(), - ); + let mut i8042 = I8042Device::new(EventFd::new(libc::EFD_NONBLOCK).unwrap()).unwrap(); fn expect_key(i8042: &mut I8042Device, key: u16) { let mut data = [1]; From db5de04a6de9d96f976c98b25b859dbe5838ae34 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 12 May 2025 21:12:34 +0200 Subject: [PATCH 13/56] test: add network interface to test_serial_dos test test_serial_dos test checks that when we send a lot of bytes in the serial device the emulation logic does not increase indefinitely the underlying buffer that we use for when the device is set in loopback mode. 
However, the test does not wait for the microVM to start and sometimes the virtual memory allocation may increase between readings. Add a network device to the microVM so that we implicitly wait until it has booted before taking the first measurement. Signed-off-by: Babis Chalios --- tests/integration_tests/functional/test_serial_io.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration_tests/functional/test_serial_io.py b/tests/integration_tests/functional/test_serial_io.py index aee9047f531..7a8c0b8c79d 100644 --- a/tests/integration_tests/functional/test_serial_io.py +++ b/tests/integration_tests/functional/test_serial_io.py @@ -146,6 +146,7 @@ def test_serial_dos(uvm_plain_any): vcpu_count=1, boot_args="console=ttyS0 reboot=k panic=1 pci=off", ) + microvm.add_net_iface() microvm.start() # Open an fd for firecracker process terminal. From 17883b1daf7dc6d5e746ef0c329f6efba1eedab6 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 28 Apr 2025 11:01:54 +0200 Subject: [PATCH 14/56] pci: add pci crate from Cloud Hypervisor Bring in pci crate from cloud hypervisor with a few modifications. We use the rust-vmm vm-allocator crate instead of Cloud Hypervisor's downstream one. For the time being, rust-vmm's implementation should include all we need for supporting the devices we care about. If we need more functionality from our allocators, we will implement the logic directly in the rust-vmm vm-allocator crate. 
Signed-off-by: Babis Chalios --- Cargo.lock | 23 + src/pci/Cargo.toml | 25 + src/pci/src/bus.rs | 477 +++++++++++ src/pci/src/configuration.rs | 1252 ++++++++++++++++++++++++++++ src/pci/src/device.rs | 136 +++ src/pci/src/lib.rs | 198 +++++ src/pci/src/msi.rs | 282 +++++++ src/pci/src/msix.rs | 552 ++++++++++++ src/vmm/Cargo.toml | 4 + src/vmm/src/device_manager/mmio.rs | 2 +- src/vmm/src/persist.rs | 2 +- 11 files changed, 2951 insertions(+), 2 deletions(-) create mode 100644 src/pci/Cargo.toml create mode 100644 src/pci/src/bus.rs create mode 100644 src/pci/src/configuration.rs create mode 100644 src/pci/src/device.rs create mode 100644 src/pci/src/lib.rs create mode 100644 src/pci/src/msi.rs create mode 100644 src/pci/src/msix.rs diff --git a/Cargo.lock b/Cargo.lock index 82fa7b8c2d8..c74c2191d0e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -250,6 +250,12 @@ version = "3.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "cargo_toml" version = "0.22.1" @@ -1027,6 +1033,20 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +[[package]] +name = "pci" +version = "0.1.0" +dependencies = [ + "byteorder", + "libc", + "log", + "serde", + "thiserror 2.0.12", + "vm-allocator", + "vm-device", + "vm-memory", +] + [[package]] name = "peeking_take_while" version = "0.1.2" @@ -1646,6 +1666,7 @@ dependencies = [ "log-instrument", "memfd", "micro_http", + "pci", "proptest", "semver", "serde", @@ -1655,8 +1676,10 @@ dependencies = [ "timerfd", "userfaultfd", "utils", + "uuid", "vhost", "vm-allocator", + "vm-device", "vm-fdt", "vm-memory", "vm-superio", diff 
--git a/src/pci/Cargo.toml b/src/pci/Cargo.toml new file mode 100644 index 00000000000..c88cd270b23 --- /dev/null +++ b/src/pci/Cargo.toml @@ -0,0 +1,25 @@ +[package] +authors = ["Samuel Ortiz "] +edition = "2021" +name = "pci" +version = "0.1.0" +license = "Apache-2.0 AND BSD-3-Clause" + +[lib] +bench = false + +[features] +default = [] + +[dependencies] +byteorder = "1.5.0" +libc = "0.2.172" +log = "0.4.27" +serde = { version = "1.0.219", features = ["derive"] } +thiserror = "2.0.12" +vm-allocator = "0.1.2" +vm-device = { path = "../vm-device" } +vm-memory = { version = "0.16.1", features = [ + "backend-mmap", + "backend-bitmap", +] } diff --git a/src/pci/src/bus.rs b/src/pci/src/bus.rs new file mode 100644 index 00000000000..cb42b4ee9c5 --- /dev/null +++ b/src/pci/src/bus.rs @@ -0,0 +1,477 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::any::Any; +use std::collections::HashMap; +use std::ops::DerefMut; +use std::sync::{Arc, Barrier, Mutex}; + +use byteorder::{ByteOrder, LittleEndian}; +use vm_device::{Bus, BusDevice, BusDeviceSync}; + +use crate::configuration::{ + PciBarRegionType, PciBridgeSubclass, PciClassCode, PciConfiguration, PciHeaderType, +}; +use crate::device::{DeviceRelocation, Error as PciDeviceError, PciDevice}; +use crate::PciBarConfiguration; + +const VENDOR_ID_INTEL: u16 = 0x8086; +const DEVICE_ID_INTEL_VIRT_PCIE_HOST: u16 = 0x0d57; +const NUM_DEVICE_IDS: usize = 32; + +/// Errors for device manager. +#[derive(Debug)] +pub enum PciRootError { + /// Could not allocate device address space for the device. + AllocateDeviceAddrs(PciDeviceError), + /// Could not allocate an IRQ number. + AllocateIrq, + /// Could not add a device to the port io bus. 
+ PioInsert(vm_device::BusError), + /// Could not add a device to the mmio bus. + MmioInsert(vm_device::BusError), + /// Could not find an available device slot on the PCI bus. + NoPciDeviceSlotAvailable, + /// Invalid PCI device identifier provided. + InvalidPciDeviceSlot(usize), + /// Valid PCI device identifier but already used. + AlreadyInUsePciDeviceSlot(usize), +} +pub type Result = std::result::Result; + +/// Emulates the PCI Root bridge device. +pub struct PciRoot { + /// Configuration space. + config: PciConfiguration, +} + +impl PciRoot { + /// Create an empty PCI root bridge. + pub fn new(config: Option) -> Self { + if let Some(config) = config { + PciRoot { config } + } else { + PciRoot { + config: PciConfiguration::new( + VENDOR_ID_INTEL, + DEVICE_ID_INTEL_VIRT_PCIE_HOST, + 0, + PciClassCode::BridgeDevice, + &PciBridgeSubclass::HostBridge, + None, + PciHeaderType::Device, + 0, + 0, + None, + None, + ), + } + } + } +} + +impl BusDevice for PciRoot {} + +impl PciDevice for PciRoot { + fn write_config_register( + &mut self, + reg_idx: usize, + offset: u64, + data: &[u8], + ) -> Option> { + self.config.write_config_register(reg_idx, offset, data); + None + } + + fn read_config_register(&mut self, reg_idx: usize) -> u32 { + self.config.read_reg(reg_idx) + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + fn id(&self) -> Option { + None + } +} + +pub struct PciBus { + /// Devices attached to this bus. + /// Device 0 is host bridge. 
+ devices: HashMap>>, + device_reloc: Arc, + device_ids: Vec, +} + +impl PciBus { + pub fn new(pci_root: PciRoot, device_reloc: Arc) -> Self { + let mut devices: HashMap>> = HashMap::new(); + let mut device_ids: Vec = vec![false; NUM_DEVICE_IDS]; + + devices.insert(0, Arc::new(Mutex::new(pci_root))); + device_ids[0] = true; + + PciBus { + devices, + device_reloc, + device_ids, + } + } + + pub fn register_mapping( + &self, + dev: Arc, + io_bus: &Bus, + mmio_bus: &Bus, + bars: Vec, + ) -> Result<()> { + for bar in bars { + match bar.region_type() { + PciBarRegionType::IoRegion => { + io_bus + .insert(dev.clone(), bar.addr(), bar.size()) + .map_err(PciRootError::PioInsert)?; + } + PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => { + mmio_bus + .insert(dev.clone(), bar.addr(), bar.size()) + .map_err(PciRootError::MmioInsert)?; + } + } + } + Ok(()) + } + + pub fn add_device(&mut self, device_id: u32, device: Arc>) -> Result<()> { + self.devices.insert(device_id, device); + Ok(()) + } + + pub fn remove_by_device(&mut self, device: &Arc>) -> Result<()> { + self.devices.retain(|_, dev| !Arc::ptr_eq(dev, device)); + Ok(()) + } + + pub fn next_device_id(&mut self) -> Result { + for (idx, device_id) in self.device_ids.iter_mut().enumerate() { + if !(*device_id) { + *device_id = true; + return Ok(idx as u32); + } + } + + Err(PciRootError::NoPciDeviceSlotAvailable) + } + + pub fn get_device_id(&mut self, id: usize) -> Result<()> { + if id < NUM_DEVICE_IDS { + if !self.device_ids[id] { + self.device_ids[id] = true; + Ok(()) + } else { + Err(PciRootError::AlreadyInUsePciDeviceSlot(id)) + } + } else { + Err(PciRootError::InvalidPciDeviceSlot(id)) + } + } + + pub fn put_device_id(&mut self, id: usize) -> Result<()> { + if id < NUM_DEVICE_IDS { + self.device_ids[id] = false; + Ok(()) + } else { + Err(PciRootError::InvalidPciDeviceSlot(id)) + } + } +} + +pub struct PciConfigIo { + /// Config space register. 
+ config_address: u32, + pci_bus: Arc>, +} + +impl PciConfigIo { + pub fn new(pci_bus: Arc>) -> Self { + PciConfigIo { + config_address: 0, + pci_bus, + } + } + + pub fn config_space_read(&self) -> u32 { + let enabled = (self.config_address & 0x8000_0000) != 0; + if !enabled { + return 0xffff_ffff; + } + + let (bus, device, function, register) = + parse_io_config_address(self.config_address & !0x8000_0000); + + // Only support one bus. + if bus != 0 { + return 0xffff_ffff; + } + + // Don't support multi-function devices. + if function > 0 { + return 0xffff_ffff; + } + + self.pci_bus + .as_ref() + .lock() + .unwrap() + .devices + .get(&(device as u32)) + .map_or(0xffff_ffff, |d| { + d.lock().unwrap().read_config_register(register) + }) + } + + pub fn config_space_write(&mut self, offset: u64, data: &[u8]) -> Option> { + if offset as usize + data.len() > 4 { + return None; + } + + let enabled = (self.config_address & 0x8000_0000) != 0; + if !enabled { + return None; + } + + let (bus, device, _function, register) = + parse_io_config_address(self.config_address & !0x8000_0000); + + // Only support one bus. + if bus != 0 { + return None; + } + + let pci_bus = self.pci_bus.as_ref().lock().unwrap(); + if let Some(d) = pci_bus.devices.get(&(device as u32)) { + let mut device = d.lock().unwrap(); + + // Find out if one of the device's BAR is being reprogrammed, and + // reprogram it if needed. 
+ if let Some(params) = device.detect_bar_reprogramming(register, data) { + if let Err(e) = pci_bus.device_reloc.move_bar( + params.old_base, + params.new_base, + params.len, + device.deref_mut(), + params.region_type, + ) { + error!( + "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x})", + e, params.old_base, params.new_base, params.len + ); + } + } + + // Update the register value + device.write_config_register(register, offset, data) + } else { + None + } + } + + fn set_config_address(&mut self, offset: u64, data: &[u8]) { + if offset as usize + data.len() > 4 { + return; + } + let (mask, value): (u32, u32) = match data.len() { + 1 => ( + 0x0000_00ff << (offset * 8), + u32::from(data[0]) << (offset * 8), + ), + 2 => ( + 0x0000_ffff << (offset * 16), + ((u32::from(data[1]) << 8) | u32::from(data[0])) << (offset * 16), + ), + 4 => (0xffff_ffff, LittleEndian::read_u32(data)), + _ => return, + }; + self.config_address = (self.config_address & !mask) | value; + } +} + +impl BusDevice for PciConfigIo { + fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + // `offset` is relative to 0xcf8 + let value = match offset { + 0..=3 => self.config_address, + 4..=7 => self.config_space_read(), + _ => 0xffff_ffff, + }; + + // Only allow reads to the register boundary. + let start = offset as usize % 4; + let end = start + data.len(); + if end <= 4 { + for i in start..end { + data[i - start] = (value >> (i * 8)) as u8; + } + } else { + for d in data { + *d = 0xff; + } + } + } + + fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { + // `offset` is relative to 0xcf8 + match offset { + o @ 0..=3 => { + self.set_config_address(o, data); + None + } + o @ 4..=7 => self.config_space_write(o - 4, data), + _ => None, + } + } +} + +/// Emulates PCI memory-mapped configuration access mechanism. 
+pub struct PciConfigMmio { + pci_bus: Arc>, +} + +impl PciConfigMmio { + pub fn new(pci_bus: Arc>) -> Self { + PciConfigMmio { pci_bus } + } + + fn config_space_read(&self, config_address: u32) -> u32 { + let (bus, device, _function, register) = parse_mmio_config_address(config_address); + + // Only support one bus. + if bus != 0 { + return 0xffff_ffff; + } + + self.pci_bus + .lock() + .unwrap() + .devices + .get(&(device as u32)) + .map_or(0xffff_ffff, |d| { + d.lock().unwrap().read_config_register(register) + }) + } + + fn config_space_write(&mut self, config_address: u32, offset: u64, data: &[u8]) { + if offset as usize + data.len() > 4 { + return; + } + + let (bus, device, _function, register) = parse_mmio_config_address(config_address); + + // Only support one bus. + if bus != 0 { + return; + } + + let pci_bus = self.pci_bus.lock().unwrap(); + if let Some(d) = pci_bus.devices.get(&(device as u32)) { + let mut device = d.lock().unwrap(); + + // Find out if one of the device's BAR is being reprogrammed, and + // reprogram it if needed. + if let Some(params) = device.detect_bar_reprogramming(register, data) { + if let Err(e) = pci_bus.device_reloc.move_bar( + params.old_base, + params.new_base, + params.len, + device.deref_mut(), + params.region_type, + ) { + error!( + "Failed moving device BAR: {}: 0x{:x}->0x{:x}(0x{:x})", + e, params.old_base, params.new_base, params.len + ); + } + } + + // Update the register value + device.write_config_register(register, offset, data); + } + } +} + +impl BusDevice for PciConfigMmio { + fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + // Only allow reads to the register boundary. 
+ let start = offset as usize % 4; + let end = start + data.len(); + if end > 4 || offset > u64::from(u32::MAX) { + for d in data { + *d = 0xff; + } + return; + } + + let value = self.config_space_read(offset as u32); + for i in start..end { + data[i - start] = (value >> (i * 8)) as u8; + } + } + + fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { + if offset > u64::from(u32::MAX) { + return None; + } + self.config_space_write(offset as u32, offset % 4, data); + + None + } +} + +fn shift_and_mask(value: u32, offset: usize, mask: u32) -> usize { + ((value >> offset) & mask) as usize +} + +// Parse the MMIO address offset to a (bus, device, function, register) tuple. +// See section 7.2.2 PCI Express Enhanced Configuration Access Mechanism (ECAM) +// from the Pci Express Base Specification Revision 5.0 Version 1.0. +fn parse_mmio_config_address(config_address: u32) -> (usize, usize, usize, usize) { + const BUS_NUMBER_OFFSET: usize = 20; + const BUS_NUMBER_MASK: u32 = 0x00ff; + const DEVICE_NUMBER_OFFSET: usize = 15; + const DEVICE_NUMBER_MASK: u32 = 0x1f; + const FUNCTION_NUMBER_OFFSET: usize = 12; + const FUNCTION_NUMBER_MASK: u32 = 0x07; + const REGISTER_NUMBER_OFFSET: usize = 2; + const REGISTER_NUMBER_MASK: u32 = 0x3ff; + + ( + shift_and_mask(config_address, BUS_NUMBER_OFFSET, BUS_NUMBER_MASK), + shift_and_mask(config_address, DEVICE_NUMBER_OFFSET, DEVICE_NUMBER_MASK), + shift_and_mask(config_address, FUNCTION_NUMBER_OFFSET, FUNCTION_NUMBER_MASK), + shift_and_mask(config_address, REGISTER_NUMBER_OFFSET, REGISTER_NUMBER_MASK), + ) +} + +// Parse the CONFIG_ADDRESS register to a (bus, device, function, register) tuple. 
+fn parse_io_config_address(config_address: u32) -> (usize, usize, usize, usize) { + const BUS_NUMBER_OFFSET: usize = 16; + const BUS_NUMBER_MASK: u32 = 0x00ff; + const DEVICE_NUMBER_OFFSET: usize = 11; + const DEVICE_NUMBER_MASK: u32 = 0x1f; + const FUNCTION_NUMBER_OFFSET: usize = 8; + const FUNCTION_NUMBER_MASK: u32 = 0x07; + const REGISTER_NUMBER_OFFSET: usize = 2; + const REGISTER_NUMBER_MASK: u32 = 0x3f; + + ( + shift_and_mask(config_address, BUS_NUMBER_OFFSET, BUS_NUMBER_MASK), + shift_and_mask(config_address, DEVICE_NUMBER_OFFSET, DEVICE_NUMBER_MASK), + shift_and_mask(config_address, FUNCTION_NUMBER_OFFSET, FUNCTION_NUMBER_MASK), + shift_and_mask(config_address, REGISTER_NUMBER_OFFSET, REGISTER_NUMBER_MASK), + ) +} diff --git a/src/pci/src/configuration.rs b/src/pci/src/configuration.rs new file mode 100644 index 00000000000..3a53167148c --- /dev/null +++ b/src/pci/src/configuration.rs @@ -0,0 +1,1252 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::fmt::{self, Display}; +use std::sync::{Arc, Mutex}; + +use byteorder::{ByteOrder, LittleEndian}; +use serde::{Deserialize, Serialize}; +use vm_device::PciBarType; + +use crate::device::BarReprogrammingParams; +use crate::{MsixConfig, PciInterruptPin}; + +// The number of 32bit registers in the config space, 4096 bytes. 
+const NUM_CONFIGURATION_REGISTERS: usize = 1024; + +const STATUS_REG: usize = 1; +const STATUS_REG_CAPABILITIES_USED_MASK: u32 = 0x0010_0000; +const BAR0_REG: usize = 4; +const ROM_BAR_REG: usize = 12; +const ROM_BAR_IDX: usize = 6; +const BAR_IO_ADDR_MASK: u32 = 0xffff_fffc; +const BAR_MEM_ADDR_MASK: u32 = 0xffff_fff0; +const ROM_BAR_ADDR_MASK: u32 = 0xffff_f800; +const MSI_CAPABILITY_REGISTER_MASK: u32 = 0x0071_0000; +const MSIX_CAPABILITY_REGISTER_MASK: u32 = 0xc000_0000; +const NUM_BAR_REGS: usize = 6; +const CAPABILITY_LIST_HEAD_OFFSET: usize = 0x34; +const FIRST_CAPABILITY_OFFSET: usize = 0x40; +const CAPABILITY_MAX_OFFSET: usize = 192; + +const INTERRUPT_LINE_PIN_REG: usize = 15; + +pub const PCI_CONFIGURATION_ID: &str = "pci_configuration"; + +/// Represents the types of PCI headers allowed in the configuration registers. +#[derive(Copy, Clone)] +pub enum PciHeaderType { + Device, + Bridge, +} + +/// Classes of PCI nodes. +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciClassCode { + TooOld, + MassStorage, + NetworkController, + DisplayController, + MultimediaController, + MemoryController, + BridgeDevice, + SimpleCommunicationController, + BaseSystemPeripheral, + InputDevice, + DockingStation, + Processor, + SerialBusController, + WirelessController, + IntelligentIoController, + EncryptionController, + DataAcquisitionSignalProcessing, + Other = 0xff, +} + +impl PciClassCode { + pub fn get_register_value(self) -> u8 { + self as u8 + } +} + +/// A PCI subclass. Each class in `PciClassCode` can specify a unique set of subclasses. This trait +/// is implemented by each subclass. It allows use of a trait object to generate configurations. +pub trait PciSubclass { + /// Convert this subclass to the value used in the PCI specification. + fn get_register_value(&self) -> u8; +} + +/// Subclasses of the MultimediaController class. 
+#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciMultimediaSubclass { + VideoController = 0x00, + AudioController = 0x01, + TelephonyDevice = 0x02, + AudioDevice = 0x03, + Other = 0x80, +} + +impl PciSubclass for PciMultimediaSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Subclasses of the BridgeDevice +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciBridgeSubclass { + HostBridge = 0x00, + IsaBridge = 0x01, + EisaBridge = 0x02, + McaBridge = 0x03, + PciToPciBridge = 0x04, + PcmciaBridge = 0x05, + NuBusBridge = 0x06, + CardBusBridge = 0x07, + RacEwayBridge = 0x08, + PciToPciSemiTransparentBridge = 0x09, + InfiniBrandToPciHostBridge = 0x0a, + OtherBridgeDevice = 0x80, +} + +impl PciSubclass for PciBridgeSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Subclass of the SerialBus +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciSerialBusSubClass { + Firewire = 0x00, + Accessbus = 0x01, + Ssa = 0x02, + Usb = 0x03, +} + +impl PciSubclass for PciSerialBusSubClass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Mass Storage Sub Classes +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciMassStorageSubclass { + ScsiStorage = 0x00, + IdeInterface = 0x01, + FloppyController = 0x02, + IpiController = 0x03, + RaidController = 0x04, + AtaController = 0x05, + SataController = 0x06, + SerialScsiController = 0x07, + NvmController = 0x08, + MassStorage = 0x80, +} + +impl PciSubclass for PciMassStorageSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Network Controller Sub Classes +#[allow(dead_code)] +#[derive(Copy, Clone)] +pub enum PciNetworkControllerSubclass { + EthernetController = 0x00, + TokenRingController = 0x01, + FddiController = 0x02, + AtmController = 0x03, + IsdnController = 0x04, + WorldFipController = 0x05, + PicmgController = 0x06, + InfinibandController = 0x07, + FabricController = 0x08, + NetworkController = 0x80, +} + +impl 
PciSubclass for PciNetworkControllerSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +/// Trait to define a PCI class programming interface +/// +/// Each combination of `PciClassCode` and `PciSubclass` can specify a +/// set of register-level programming interfaces. +/// This trait is implemented by each programming interface. +/// It allows use of a trait object to generate configurations. +pub trait PciProgrammingInterface { + /// Convert this programming interface to the value used in the PCI specification. + fn get_register_value(&self) -> u8; +} + +/// Types of PCI capabilities. +#[derive(PartialEq, Eq, Copy, Clone)] +#[allow(dead_code)] +#[allow(non_camel_case_types)] +#[repr(u8)] +pub enum PciCapabilityId { + ListId = 0, + PowerManagement = 0x01, + AcceleratedGraphicsPort = 0x02, + VitalProductData = 0x03, + SlotIdentification = 0x04, + MessageSignalledInterrupts = 0x05, + CompactPciHotSwap = 0x06, + PciX = 0x07, + HyperTransport = 0x08, + VendorSpecific = 0x09, + Debugport = 0x0A, + CompactPciCentralResourceControl = 0x0B, + PciStandardHotPlugController = 0x0C, + BridgeSubsystemVendorDeviceId = 0x0D, + AgpTargetPciPcibridge = 0x0E, + SecureDevice = 0x0F, + PciExpress = 0x10, + MsiX = 0x11, + SataDataIndexConf = 0x12, + PciAdvancedFeatures = 0x13, + PciEnhancedAllocation = 0x14, +} + +impl From for PciCapabilityId { + fn from(c: u8) -> Self { + match c { + 0 => PciCapabilityId::ListId, + 0x01 => PciCapabilityId::PowerManagement, + 0x02 => PciCapabilityId::AcceleratedGraphicsPort, + 0x03 => PciCapabilityId::VitalProductData, + 0x04 => PciCapabilityId::SlotIdentification, + 0x05 => PciCapabilityId::MessageSignalledInterrupts, + 0x06 => PciCapabilityId::CompactPciHotSwap, + 0x07 => PciCapabilityId::PciX, + 0x08 => PciCapabilityId::HyperTransport, + 0x09 => PciCapabilityId::VendorSpecific, + 0x0A => PciCapabilityId::Debugport, + 0x0B => PciCapabilityId::CompactPciCentralResourceControl, + 0x0C => 
PciCapabilityId::PciStandardHotPlugController, + 0x0D => PciCapabilityId::BridgeSubsystemVendorDeviceId, + 0x0E => PciCapabilityId::AgpTargetPciPcibridge, + 0x0F => PciCapabilityId::SecureDevice, + 0x10 => PciCapabilityId::PciExpress, + 0x11 => PciCapabilityId::MsiX, + 0x12 => PciCapabilityId::SataDataIndexConf, + 0x13 => PciCapabilityId::PciAdvancedFeatures, + 0x14 => PciCapabilityId::PciEnhancedAllocation, + _ => PciCapabilityId::ListId, + } + } +} + +/// Types of PCI Express capabilities. +#[derive(PartialEq, Eq, Copy, Clone, Debug)] +#[allow(dead_code)] +#[repr(u16)] +pub enum PciExpressCapabilityId { + NullCapability = 0x0000, + AdvancedErrorReporting = 0x0001, + VirtualChannelMultiFunctionVirtualChannelNotPresent = 0x0002, + DeviceSerialNumber = 0x0003, + PowerBudgeting = 0x0004, + RootComplexLinkDeclaration = 0x0005, + RootComplexInternalLinkControl = 0x0006, + RootComplexEventCollectorEndpointAssociation = 0x0007, + MultiFunctionVirtualChannel = 0x0008, + VirtualChannelMultiFunctionVirtualChannelPresent = 0x0009, + RootComplexRegisterBlock = 0x000a, + VendorSpecificExtendedCapability = 0x000b, + ConfigurationAccessCorrelation = 0x000c, + AccessControlServices = 0x000d, + AlternativeRoutingIdentificationInterpretation = 0x000e, + AddressTranslationServices = 0x000f, + SingleRootIoVirtualization = 0x0010, + DeprecatedMultiRootIoVirtualization = 0x0011, + Multicast = 0x0012, + PageRequestInterface = 0x0013, + ReservedForAmd = 0x0014, + ResizeableBar = 0x0015, + DynamicPowerAllocation = 0x0016, + ThpRequester = 0x0017, + LatencyToleranceReporting = 0x0018, + SecondaryPciExpress = 0x0019, + ProtocolMultiplexing = 0x001a, + ProcessAddressSpaceId = 0x001b, + LnRequester = 0x001c, + DownstreamPortContainment = 0x001d, + L1PmSubstates = 0x001e, + PrecisionTimeMeasurement = 0x001f, + PciExpressOverMphy = 0x0020, + FRSQueueing = 0x0021, + ReadinessTimeReporting = 0x0022, + DesignatedVendorSpecificExtendedCapability = 0x0023, + VfResizeableBar = 0x0024, + 
DataLinkFeature = 0x0025, + PhysicalLayerSixteenGts = 0x0026, + LaneMarginingAtTheReceiver = 0x0027, + HierarchyId = 0x0028, + NativePcieEnclosureManagement = 0x0029, + PhysicalLayerThirtyTwoGts = 0x002a, + AlternateProtocol = 0x002b, + SystemFirmwareIntermediary = 0x002c, + ShadowFunctions = 0x002d, + DataObjectExchange = 0x002e, + Reserved = 0x002f, + ExtendedCapabilitiesAbsence = 0xffff, +} + +impl From for PciExpressCapabilityId { + fn from(c: u16) -> Self { + match c { + 0x0000 => PciExpressCapabilityId::NullCapability, + 0x0001 => PciExpressCapabilityId::AdvancedErrorReporting, + 0x0002 => PciExpressCapabilityId::VirtualChannelMultiFunctionVirtualChannelNotPresent, + 0x0003 => PciExpressCapabilityId::DeviceSerialNumber, + 0x0004 => PciExpressCapabilityId::PowerBudgeting, + 0x0005 => PciExpressCapabilityId::RootComplexLinkDeclaration, + 0x0006 => PciExpressCapabilityId::RootComplexInternalLinkControl, + 0x0007 => PciExpressCapabilityId::RootComplexEventCollectorEndpointAssociation, + 0x0008 => PciExpressCapabilityId::MultiFunctionVirtualChannel, + 0x0009 => PciExpressCapabilityId::VirtualChannelMultiFunctionVirtualChannelPresent, + 0x000a => PciExpressCapabilityId::RootComplexRegisterBlock, + 0x000b => PciExpressCapabilityId::VendorSpecificExtendedCapability, + 0x000c => PciExpressCapabilityId::ConfigurationAccessCorrelation, + 0x000d => PciExpressCapabilityId::AccessControlServices, + 0x000e => PciExpressCapabilityId::AlternativeRoutingIdentificationInterpretation, + 0x000f => PciExpressCapabilityId::AddressTranslationServices, + 0x0010 => PciExpressCapabilityId::SingleRootIoVirtualization, + 0x0011 => PciExpressCapabilityId::DeprecatedMultiRootIoVirtualization, + 0x0012 => PciExpressCapabilityId::Multicast, + 0x0013 => PciExpressCapabilityId::PageRequestInterface, + 0x0014 => PciExpressCapabilityId::ReservedForAmd, + 0x0015 => PciExpressCapabilityId::ResizeableBar, + 0x0016 => PciExpressCapabilityId::DynamicPowerAllocation, + 0x0017 => 
PciExpressCapabilityId::ThpRequester, + 0x0018 => PciExpressCapabilityId::LatencyToleranceReporting, + 0x0019 => PciExpressCapabilityId::SecondaryPciExpress, + 0x001a => PciExpressCapabilityId::ProtocolMultiplexing, + 0x001b => PciExpressCapabilityId::ProcessAddressSpaceId, + 0x001c => PciExpressCapabilityId::LnRequester, + 0x001d => PciExpressCapabilityId::DownstreamPortContainment, + 0x001e => PciExpressCapabilityId::L1PmSubstates, + 0x001f => PciExpressCapabilityId::PrecisionTimeMeasurement, + 0x0020 => PciExpressCapabilityId::PciExpressOverMphy, + 0x0021 => PciExpressCapabilityId::FRSQueueing, + 0x0022 => PciExpressCapabilityId::ReadinessTimeReporting, + 0x0023 => PciExpressCapabilityId::DesignatedVendorSpecificExtendedCapability, + 0x0024 => PciExpressCapabilityId::VfResizeableBar, + 0x0025 => PciExpressCapabilityId::DataLinkFeature, + 0x0026 => PciExpressCapabilityId::PhysicalLayerSixteenGts, + 0x0027 => PciExpressCapabilityId::LaneMarginingAtTheReceiver, + 0x0028 => PciExpressCapabilityId::HierarchyId, + 0x0029 => PciExpressCapabilityId::NativePcieEnclosureManagement, + 0x002a => PciExpressCapabilityId::PhysicalLayerThirtyTwoGts, + 0x002b => PciExpressCapabilityId::AlternateProtocol, + 0x002c => PciExpressCapabilityId::SystemFirmwareIntermediary, + 0x002d => PciExpressCapabilityId::ShadowFunctions, + 0x002e => PciExpressCapabilityId::DataObjectExchange, + 0xffff => PciExpressCapabilityId::ExtendedCapabilitiesAbsence, + _ => PciExpressCapabilityId::Reserved, + } + } +} + +/// A PCI capability list. Devices can optionally specify capabilities in their configuration space. 
+pub trait PciCapability { + fn bytes(&self) -> &[u8]; + fn id(&self) -> PciCapabilityId; +} + +fn encode_32_bits_bar_size(bar_size: u32) -> Option { + if bar_size > 0 { + return Some(!(bar_size - 1)); + } + None +} + +fn decode_32_bits_bar_size(bar_size: u32) -> Option { + if bar_size > 0 { + return Some(!bar_size + 1); + } + None +} + +fn encode_64_bits_bar_size(bar_size: u64) -> Option<(u32, u32)> { + if bar_size > 0 { + let result = !(bar_size - 1); + let result_hi = (result >> 32) as u32; + let result_lo = (result & 0xffff_ffff) as u32; + return Some((result_hi, result_lo)); + } + None +} + +fn decode_64_bits_bar_size(bar_size_hi: u32, bar_size_lo: u32) -> Option { + let bar_size: u64 = ((bar_size_hi as u64) << 32) | (bar_size_lo as u64); + if bar_size > 0 { + return Some(!bar_size + 1); + } + None +} + +#[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)] +struct PciBar { + addr: u32, + size: u32, + used: bool, + r#type: Option, +} + +#[derive(Serialize, Deserialize)] +pub struct PciConfigurationState { + registers: Vec, + writable_bits: Vec, + bars: Vec, + rom_bar_addr: u32, + rom_bar_size: u32, + rom_bar_used: bool, + last_capability: Option<(usize, usize)>, + msix_cap_reg_idx: Option, +} + +/// Contains the configuration space of a PCI node. +/// +/// See the [specification](https://en.wikipedia.org/wiki/PCI_configuration_space). +/// The configuration space is accessed with DWORD reads and writes from the guest. +pub struct PciConfiguration { + registers: [u32; NUM_CONFIGURATION_REGISTERS], + writable_bits: [u32; NUM_CONFIGURATION_REGISTERS], // writable bits for each register. + bars: [PciBar; NUM_BAR_REGS], + rom_bar_addr: u32, + rom_bar_size: u32, + rom_bar_used: bool, + // Contains the byte offset and size of the last capability. 
+ last_capability: Option<(usize, usize)>, + msix_cap_reg_idx: Option, + msix_config: Option>>, +} + +/// See pci_regs.h in kernel +#[derive(Copy, Clone, PartialEq, Eq, Serialize, Deserialize, Debug)] +pub enum PciBarRegionType { + Memory32BitRegion = 0, + IoRegion = 0x01, + Memory64BitRegion = 0x04, +} + +impl From for PciBarRegionType { + fn from(type_: PciBarType) -> Self { + match type_ { + PciBarType::Io => PciBarRegionType::IoRegion, + PciBarType::Mmio32 => PciBarRegionType::Memory32BitRegion, + PciBarType::Mmio64 => PciBarRegionType::Memory64BitRegion, + } + } +} + +impl From for PciBarType { + fn from(val: PciBarRegionType) -> Self { + match val { + PciBarRegionType::IoRegion => PciBarType::Io, + PciBarRegionType::Memory32BitRegion => PciBarType::Mmio32, + PciBarRegionType::Memory64BitRegion => PciBarType::Mmio64, + } + } +} + +#[derive(Copy, Clone)] +pub enum PciBarPrefetchable { + NotPrefetchable = 0, + Prefetchable = 0x08, +} + +impl From for bool { + fn from(val: PciBarPrefetchable) -> Self { + match val { + PciBarPrefetchable::NotPrefetchable => false, + PciBarPrefetchable::Prefetchable => true, + } + } +} + +#[derive(Copy, Clone)] +pub struct PciBarConfiguration { + addr: u64, + size: u64, + idx: usize, + region_type: PciBarRegionType, + prefetchable: PciBarPrefetchable, +} + +#[derive(Debug)] +pub enum Error { + BarAddressInvalid(u64, u64), + BarInUse(usize), + BarInUse64(usize), + BarInvalid(usize), + BarInvalid64(usize), + BarSizeInvalid(u64), + CapabilityEmpty, + CapabilityLengthInvalid(usize), + CapabilitySpaceFull(usize), + Decode32BarSize, + Decode64BarSize, + Encode32BarSize, + Encode64BarSize, + RomBarAddressInvalid(u64, u64), + RomBarInUse(usize), + RomBarInvalid(usize), + RomBarSizeInvalid(u64), +} +pub type Result = std::result::Result; + +impl std::error::Error for Error {} + +impl Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use self::Error::*; + match self { + BarAddressInvalid(a, s) => write!(f, 
"address {a} size {s} too big"), + BarInUse(b) => write!(f, "bar {b} already used"), + BarInUse64(b) => write!(f, "64bit bar {b} already used(requires two regs)"), + BarInvalid(b) => write!(f, "bar {} invalid, max {}", b, NUM_BAR_REGS - 1), + BarInvalid64(b) => write!( + f, + "64bitbar {} invalid, requires two regs, max {}", + b, + NUM_BAR_REGS - 1 + ), + BarSizeInvalid(s) => write!(f, "bar address {s} not a power of two"), + CapabilityEmpty => write!(f, "empty capabilities are invalid"), + CapabilityLengthInvalid(l) => write!(f, "Invalid capability length {l}"), + CapabilitySpaceFull(s) => write!(f, "capability of size {s} doesn't fit"), + Decode32BarSize => write!(f, "failed to decode 32 bits BAR size"), + Decode64BarSize => write!(f, "failed to decode 64 bits BAR size"), + Encode32BarSize => write!(f, "failed to encode 32 bits BAR size"), + Encode64BarSize => write!(f, "failed to encode 64 bits BAR size"), + RomBarAddressInvalid(a, s) => write!(f, "address {a} size {s} too big"), + RomBarInUse(b) => write!(f, "rom bar {b} already used"), + RomBarInvalid(b) => write!(f, "rom bar {} invalid, max {}", b, NUM_BAR_REGS - 1), + RomBarSizeInvalid(s) => write!(f, "rom bar address {s} not a power of two"), + } + } +} + +impl PciConfiguration { + #[allow(clippy::too_many_arguments)] + pub fn new( + vendor_id: u16, + device_id: u16, + revision_id: u8, + class_code: PciClassCode, + subclass: &dyn PciSubclass, + programming_interface: Option<&dyn PciProgrammingInterface>, + header_type: PciHeaderType, + subsystem_vendor_id: u16, + subsystem_id: u16, + msix_config: Option>>, + state: Option, + ) -> Self { + let ( + registers, + writable_bits, + bars, + rom_bar_addr, + rom_bar_size, + rom_bar_used, + last_capability, + msix_cap_reg_idx, + ) = if let Some(state) = state { + ( + state.registers.try_into().unwrap(), + state.writable_bits.try_into().unwrap(), + state.bars.try_into().unwrap(), + state.rom_bar_addr, + state.rom_bar_size, + state.rom_bar_used, + 
state.last_capability, + state.msix_cap_reg_idx, + ) + } else { + let mut registers = [0u32; NUM_CONFIGURATION_REGISTERS]; + let mut writable_bits = [0u32; NUM_CONFIGURATION_REGISTERS]; + registers[0] = (u32::from(device_id) << 16) | u32::from(vendor_id); + // TODO(dverkamp): Status should be write-1-to-clear + writable_bits[1] = 0x0000_ffff; // Status (r/o), command (r/w) + let pi = if let Some(pi) = programming_interface { + pi.get_register_value() + } else { + 0 + }; + registers[2] = (u32::from(class_code.get_register_value()) << 24) + | (u32::from(subclass.get_register_value()) << 16) + | (u32::from(pi) << 8) + | u32::from(revision_id); + writable_bits[3] = 0x0000_00ff; // Cacheline size (r/w) + match header_type { + PciHeaderType::Device => { + registers[3] = 0x0000_0000; // Header type 0 (device) + writable_bits[15] = 0x0000_00ff; // Interrupt line (r/w) + } + PciHeaderType::Bridge => { + registers[3] = 0x0001_0000; // Header type 1 (bridge) + writable_bits[9] = 0xfff0_fff0; // Memory base and limit + writable_bits[15] = 0xffff_00ff; // Bridge control (r/w), interrupt line (r/w) + } + }; + registers[11] = (u32::from(subsystem_id) << 16) | u32::from(subsystem_vendor_id); + + ( + registers, + writable_bits, + [PciBar::default(); NUM_BAR_REGS], + 0, + 0, + false, + None, + None, + ) + }; + + PciConfiguration { + registers, + writable_bits, + bars, + rom_bar_addr, + rom_bar_size, + rom_bar_used, + last_capability, + msix_cap_reg_idx, + msix_config, + } + } + + pub fn state(&self) -> PciConfigurationState { + PciConfigurationState { + registers: self.registers.to_vec(), + writable_bits: self.writable_bits.to_vec(), + bars: self.bars.to_vec(), + rom_bar_addr: self.rom_bar_addr, + rom_bar_size: self.rom_bar_size, + rom_bar_used: self.rom_bar_used, + last_capability: self.last_capability, + msix_cap_reg_idx: self.msix_cap_reg_idx, + } + } + + /// Reads a 32bit register from `reg_idx` in the register map. 
+    pub fn read_reg(&self, reg_idx: usize) -> u32 {
+        // Registers outside the register map read as all ones.
+        self.registers.get(reg_idx).copied().unwrap_or(0xffff_ffff)
+    }
+
+    /// Writes a 32bit register to `reg_idx` in the register map.
+    ///
+    /// Only the bits marked writable for that register are updated. A
+    /// `reg_idx` outside the register map is logged and otherwise ignored.
+    pub fn write_reg(&mut self, reg_idx: usize, value: u32) {
+        // Validate the index before touching any per-register table, so that a
+        // bad index takes the warning path instead of panicking on indexing.
+        let writable = match self.writable_bits.get(reg_idx) {
+            Some(bits) => *bits,
+            None => {
+                warn!("bad PCI register write {}", reg_idx);
+                return;
+            }
+        };
+        let mut mask = writable;
+
+        if (BAR0_REG..BAR0_REG + NUM_BAR_REGS).contains(&reg_idx) {
+            // Handle very specific case where the BAR is being written with
+            // all 1's to retrieve the BAR size during next BAR reading.
+            if value == 0xffff_ffff {
+                mask &= self.bars[reg_idx - BAR0_REG].size;
+            }
+        } else if reg_idx == ROM_BAR_REG {
+            // Handle very specific case where the BAR is being written with
+            // all 1's on bits 31-11 to retrieve the BAR size during next BAR
+            // reading.
+            if value & ROM_BAR_ADDR_MASK == ROM_BAR_ADDR_MASK {
+                mask &= self.rom_bar_size;
+            }
+        }
+
+        // `registers` and `writable_bits` have the same length, so the index
+        // validated above is in range here as well.
+        let reg = &mut self.registers[reg_idx];
+        *reg = (*reg & !writable) | (value & mask);
+    }
+
+    /// Writes a 16bit word to `offset`. `offset` must be 16bit aligned.
+    ///
+    /// Misaligned offsets and offsets outside the register map are logged and
+    /// ignored. Read-only bits of the target register are preserved.
+    pub fn write_word(&mut self, offset: usize, value: u16) {
+        let shift = match offset % 4 {
+            0 => 0,
+            2 => 16,
+            _ => {
+                warn!("bad PCI config write offset {}", offset);
+                return;
+            }
+        };
+        let reg_idx = offset / 4;
+
+        if let Some(r) = self.registers.get_mut(reg_idx) {
+            let writable_mask = self.writable_bits[reg_idx];
+            let mask = (0xffffu32 << shift) & writable_mask;
+            let shifted_value = (u32::from(value) << shift) & writable_mask;
+            *r = *r & !mask | shifted_value;
+        } else {
+            warn!("bad PCI config write offset {}", offset);
+        }
+    }
+
+    /// Writes a byte to `offset`.
+    ///
+    /// Read-only bits of the target register are preserved.
+    pub fn write_byte(&mut self, offset: usize, value: u8) {
+        self.write_byte_internal(offset, value, true);
+    }
+
+    /// Writes a byte to `offset`, optionally enforcing read-only bits.
+    fn write_byte_internal(&mut self, offset: usize, value: u8, apply_writable_mask: bool) {
+        let shift = (offset % 4) * 8;
+        let reg_idx = offset / 4;
+
+        if let Some(r) = self.registers.get_mut(reg_idx) {
+            // Without the writable mask every bit of the addressed byte is
+            // overwritten, including bits that are otherwise read-only.
+            let writable_mask = if apply_writable_mask {
+                self.writable_bits[reg_idx]
+            } else {
+                0xffff_ffff
+            };
+            let mask = (0xffu32 << shift) & writable_mask;
+            let shifted_value = (u32::from(value) << shift) & writable_mask;
+            *r = *r & !mask | shifted_value;
+        } else {
+            warn!("bad PCI config write offset {}", offset);
+        }
+    }
+
+    /// Adds a region specified by `config`. Configures the specified BAR(s) to
+    /// report this region and size to the guest kernel. Enforces a few constraints
+    /// (e.g. region size must be a power of two, register not already used).
+    ///
+    /// A 64-bit memory region occupies two consecutive BAR registers: the one at
+    /// `config.idx` holds the low half and the next one the high half.
+    ///
+    /// # Errors
+    ///
+    /// * `BarInvalid` / `BarInvalid64` if the BAR index (or, for 64-bit regions,
+    ///   the following index) is out of range.
+    /// * `BarInUse` / `BarInUse64` if a needed BAR register is already used.
+    /// * `BarSizeInvalid` if the size is not a power of two.
+    /// * `BarAddressInvalid` if the region overflows the address space of its type.
+    /// * `Encode32BarSize` / `Encode64BarSize` if the size cannot be encoded.
+    pub fn add_pci_bar(&mut self, config: &PciBarConfiguration) -> Result<()> {
+        let bar_idx = config.idx;
+
+        // Validate the index before it is used to index `self.bars`; otherwise an
+        // out-of-range index panics instead of returning `BarInvalid`.
+        if bar_idx >= NUM_BAR_REGS {
+            return Err(Error::BarInvalid(bar_idx));
+        }
+        let reg_idx = BAR0_REG + bar_idx;
+
+        if self.bars[bar_idx].used {
+            return Err(Error::BarInUse(bar_idx));
+        }
+
+        // A size of zero is not a power of two, so `config.size - 1` below
+        // cannot underflow.
+        if !config.size.is_power_of_two() {
+            return Err(Error::BarSizeInvalid(config.size));
+        }
+
+        let end_addr = config
+            .addr
+            .checked_add(config.size - 1)
+            .ok_or(Error::BarAddressInvalid(config.addr, config.size))?;
+        match config.region_type {
+            PciBarRegionType::Memory32BitRegion | PciBarRegionType::IoRegion => {
+                if end_addr > u64::from(u32::MAX) {
+                    return Err(Error::BarAddressInvalid(config.addr, config.size));
+                }
+
+                // Encode the BAR size as expected by the software running in
+                // the guest.
+                self.bars[bar_idx].size =
+                    encode_32_bits_bar_size(config.size as u32).ok_or(Error::Encode32BarSize)?;
+            }
+            PciBarRegionType::Memory64BitRegion => {
+                if bar_idx + 1 >= NUM_BAR_REGS {
+                    return Err(Error::BarInvalid64(bar_idx));
+                }
+
+                if self.bars[bar_idx + 1].used {
+                    return Err(Error::BarInUse64(bar_idx));
+                }
+
+                // Encode the BAR size as expected by the software running in
+                // the guest.
+                let (bar_size_hi, bar_size_lo) =
+                    encode_64_bits_bar_size(config.size).ok_or(Error::Encode64BarSize)?;
+
+                // The upper register holds the high 32 bits of the address and
+                // is fully writable.
+                self.registers[reg_idx + 1] = (config.addr >> 32) as u32;
+                self.writable_bits[reg_idx + 1] = 0xffff_ffff;
+                self.bars[bar_idx + 1].addr = self.registers[reg_idx + 1];
+                self.bars[bar_idx].size = bar_size_lo;
+                self.bars[bar_idx + 1].size = bar_size_hi;
+                self.bars[bar_idx + 1].used = true;
+            }
+        }
+
+        // The low bits of the (lower) BAR register carry the region type and
+        // prefetchable flags; only the address bits are guest-writable.
+        let (mask, lower_bits) = match config.region_type {
+            PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => (
+                BAR_MEM_ADDR_MASK,
+                config.prefetchable as u32 | config.region_type as u32,
+            ),
+            PciBarRegionType::IoRegion => (BAR_IO_ADDR_MASK, config.region_type as u32),
+        };
+
+        self.registers[reg_idx] = ((config.addr as u32) & mask) | lower_bits;
+        self.writable_bits[reg_idx] = mask;
+        self.bars[bar_idx].addr = self.registers[reg_idx];
+        self.bars[bar_idx].used = true;
+        self.bars[bar_idx].r#type = Some(config.region_type);
+
+        Ok(())
+    }
+
+    /// Adds rom expansion BAR.
+ pub fn add_pci_rom_bar(&mut self, config: &PciBarConfiguration, active: u32) -> Result<()> { + let bar_idx = config.idx; + let reg_idx = ROM_BAR_REG; + + if self.rom_bar_used { + return Err(Error::RomBarInUse(bar_idx)); + } + + if !config.size.is_power_of_two() { + return Err(Error::RomBarSizeInvalid(config.size)); + } + + if bar_idx != ROM_BAR_IDX { + return Err(Error::RomBarInvalid(bar_idx)); + } + + let end_addr = config + .addr + .checked_add(config.size - 1) + .ok_or(Error::RomBarAddressInvalid(config.addr, config.size))?; + + if end_addr > u64::from(u32::MAX) { + return Err(Error::RomBarAddressInvalid(config.addr, config.size)); + } + + self.registers[reg_idx] = (config.addr as u32) | active; + self.writable_bits[reg_idx] = ROM_BAR_ADDR_MASK; + self.rom_bar_addr = self.registers[reg_idx]; + self.rom_bar_size = + encode_32_bits_bar_size(config.size as u32).ok_or(Error::Encode32BarSize)?; + self.rom_bar_used = true; + + Ok(()) + } + + /// Returns the address of the given BAR region. + pub fn get_bar_addr(&self, bar_num: usize) -> u64 { + let bar_idx = BAR0_REG + bar_num; + + let mut addr = u64::from(self.bars[bar_num].addr & self.writable_bits[bar_idx]); + + if let Some(bar_type) = self.bars[bar_num].r#type { + if bar_type == PciBarRegionType::Memory64BitRegion { + addr |= u64::from(self.bars[bar_num + 1].addr) << 32; + } + } + + addr + } + + /// Configures the IRQ line and pin used by this device. + pub fn set_irq(&mut self, line: u8, pin: PciInterruptPin) { + // `pin` is 1-based in the pci config space. + let pin_idx = (pin as u32) + 1; + self.registers[INTERRUPT_LINE_PIN_REG] = (self.registers[INTERRUPT_LINE_PIN_REG] + & 0xffff_0000) + | (pin_idx << 8) + | u32::from(line); + } + + /// Adds the capability `cap_data` to the list of capabilities. + /// `cap_data` should include the two-byte PCI capability header (type, next), + /// but not populate it. Correct values will be generated automatically based + /// on `cap_data.id()`. 
+ pub fn add_capability(&mut self, cap_data: &dyn PciCapability) -> Result { + let total_len = cap_data.bytes().len(); + // Check that the length is valid. + if cap_data.bytes().is_empty() { + return Err(Error::CapabilityEmpty); + } + let (cap_offset, tail_offset) = match self.last_capability { + Some((offset, len)) => (Self::next_dword(offset, len), offset + 1), + None => (FIRST_CAPABILITY_OFFSET, CAPABILITY_LIST_HEAD_OFFSET), + }; + let end_offset = cap_offset + .checked_add(total_len) + .ok_or(Error::CapabilitySpaceFull(total_len))?; + if end_offset > CAPABILITY_MAX_OFFSET { + return Err(Error::CapabilitySpaceFull(total_len)); + } + self.registers[STATUS_REG] |= STATUS_REG_CAPABILITIES_USED_MASK; + self.write_byte_internal(tail_offset, cap_offset as u8, false); + self.write_byte_internal(cap_offset, cap_data.id() as u8, false); + self.write_byte_internal(cap_offset + 1, 0, false); // Next pointer. + for (i, byte) in cap_data.bytes().iter().enumerate() { + self.write_byte_internal(cap_offset + i + 2, *byte, false); + } + self.last_capability = Some((cap_offset, total_len)); + + match cap_data.id() { + PciCapabilityId::MessageSignalledInterrupts => { + self.writable_bits[cap_offset / 4] = MSI_CAPABILITY_REGISTER_MASK; + } + PciCapabilityId::MsiX => { + self.msix_cap_reg_idx = Some(cap_offset / 4); + self.writable_bits[self.msix_cap_reg_idx.unwrap()] = MSIX_CAPABILITY_REGISTER_MASK; + } + _ => {} + } + + Ok(cap_offset) + } + + // Find the next aligned offset after the one given. 
+ fn next_dword(offset: usize, len: usize) -> usize { + let next = offset + len; + (next + 3) & !3 + } + + pub fn write_config_register(&mut self, reg_idx: usize, offset: u64, data: &[u8]) { + if offset as usize + data.len() > 4 { + return; + } + + // Handle potential write to MSI-X message control register + if let Some(msix_cap_reg_idx) = self.msix_cap_reg_idx { + if let Some(msix_config) = &self.msix_config { + if msix_cap_reg_idx == reg_idx && offset == 2 && data.len() == 2 { + msix_config + .lock() + .unwrap() + .set_msg_ctl(LittleEndian::read_u16(data)); + } else if msix_cap_reg_idx == reg_idx && offset == 0 && data.len() == 4 { + msix_config + .lock() + .unwrap() + .set_msg_ctl((LittleEndian::read_u32(data) >> 16) as u16); + } + } + } + + match data.len() { + 1 => self.write_byte(reg_idx * 4 + offset as usize, data[0]), + 2 => self.write_word( + reg_idx * 4 + offset as usize, + u16::from(data[0]) | (u16::from(data[1]) << 8), + ), + 4 => self.write_reg(reg_idx, LittleEndian::read_u32(data)), + _ => (), + } + } + + pub fn read_config_register(&self, reg_idx: usize) -> u32 { + self.read_reg(reg_idx) + } + + pub fn detect_bar_reprogramming( + &mut self, + reg_idx: usize, + data: &[u8], + ) -> Option { + if data.len() != 4 { + return None; + } + + let value = LittleEndian::read_u32(data); + + let mask = self.writable_bits[reg_idx]; + if (BAR0_REG..BAR0_REG + NUM_BAR_REGS).contains(®_idx) { + // Ignore the case where the BAR size is being asked for. + if value == 0xffff_ffff { + return None; + } + + let bar_idx = reg_idx - 4; + // Handle special case where the address being written is + // different from the address initially provided. This is a + // BAR reprogramming case which needs to be properly caught. + if let Some(bar_type) = self.bars[bar_idx].r#type { + // In case of 64 bits memory BAR, we don't do anything until + // the upper BAR is modified, otherwise we would be moving the + // BAR to a wrong location in memory. 
+ if bar_type == PciBarRegionType::Memory64BitRegion { + return None; + } + + // Ignore the case where the value is unchanged. + if (value & mask) == (self.bars[bar_idx].addr & mask) { + return None; + } + + info!( + "Detected BAR reprogramming: (BAR {}) 0x{:x}->0x{:x}", + reg_idx, self.registers[reg_idx], value + ); + let old_base = u64::from(self.bars[bar_idx].addr & mask); + let new_base = u64::from(value & mask); + let len = u64::from( + decode_32_bits_bar_size(self.bars[bar_idx].size) + .ok_or(Error::Decode32BarSize) + .unwrap(), + ); + let region_type = bar_type; + + self.bars[bar_idx].addr = value; + + return Some(BarReprogrammingParams { + old_base, + new_base, + len, + region_type, + }); + } else if (reg_idx > BAR0_REG) + && ((self.registers[reg_idx - 1] & self.writable_bits[reg_idx - 1]) + != (self.bars[bar_idx - 1].addr & self.writable_bits[reg_idx - 1]) + || (value & mask) != (self.bars[bar_idx].addr & mask)) + { + info!( + "Detected BAR reprogramming: (BAR {}) 0x{:x}->0x{:x}", + reg_idx, self.registers[reg_idx], value + ); + let old_base = (u64::from(self.bars[bar_idx].addr & mask) << 32) + | u64::from(self.bars[bar_idx - 1].addr & self.writable_bits[reg_idx - 1]); + let new_base = (u64::from(value & mask) << 32) + | u64::from(self.registers[reg_idx - 1] & self.writable_bits[reg_idx - 1]); + let len = + decode_64_bits_bar_size(self.bars[bar_idx].size, self.bars[bar_idx - 1].size) + .ok_or(Error::Decode64BarSize) + .unwrap(); + let region_type = PciBarRegionType::Memory64BitRegion; + + self.bars[bar_idx].addr = value; + self.bars[bar_idx - 1].addr = self.registers[reg_idx - 1]; + + return Some(BarReprogrammingParams { + old_base, + new_base, + len, + region_type, + }); + } + } else if reg_idx == ROM_BAR_REG && (value & mask) != (self.rom_bar_addr & mask) { + // Ignore the case where the BAR size is being asked for. 
+ if value & ROM_BAR_ADDR_MASK == ROM_BAR_ADDR_MASK { + return None; + } + + info!( + "Detected ROM BAR reprogramming: (BAR {}) 0x{:x}->0x{:x}", + reg_idx, self.registers[reg_idx], value + ); + let old_base = u64::from(self.rom_bar_addr & mask); + let new_base = u64::from(value & mask); + let len = u64::from( + decode_32_bits_bar_size(self.rom_bar_size) + .ok_or(Error::Decode32BarSize) + .unwrap(), + ); + let region_type = PciBarRegionType::Memory32BitRegion; + + self.rom_bar_addr = value; + + return Some(BarReprogrammingParams { + old_base, + new_base, + len, + region_type, + }); + } + + None + } +} + +impl Default for PciBarConfiguration { + fn default() -> Self { + PciBarConfiguration { + idx: 0, + addr: 0, + size: 0, + region_type: PciBarRegionType::Memory64BitRegion, + prefetchable: PciBarPrefetchable::NotPrefetchable, + } + } +} + +impl PciBarConfiguration { + pub fn new( + idx: usize, + size: u64, + region_type: PciBarRegionType, + prefetchable: PciBarPrefetchable, + ) -> Self { + PciBarConfiguration { + idx, + addr: 0, + size, + region_type, + prefetchable, + } + } + + #[must_use] + pub fn set_index(mut self, idx: usize) -> Self { + self.idx = idx; + self + } + + #[must_use] + pub fn set_address(mut self, addr: u64) -> Self { + self.addr = addr; + self + } + + #[must_use] + pub fn set_size(mut self, size: u64) -> Self { + self.size = size; + self + } + + #[must_use] + pub fn set_region_type(mut self, region_type: PciBarRegionType) -> Self { + self.region_type = region_type; + self + } + + #[must_use] + pub fn set_prefetchable(mut self, prefetchable: PciBarPrefetchable) -> Self { + self.prefetchable = prefetchable; + self + } + + pub fn idx(&self) -> usize { + self.idx + } + + pub fn addr(&self) -> u64 { + self.addr + } + + pub fn size(&self) -> u64 { + self.size + } + + pub fn region_type(&self) -> PciBarRegionType { + self.region_type + } + + pub fn prefetchable(&self) -> PciBarPrefetchable { + self.prefetchable + } +} + +#[cfg(test)] +mod tests { + use 
vm_memory::ByteValued; + + use super::*; + + #[repr(C, packed)] + #[derive(Clone, Copy, Default)] + #[allow(dead_code)] + struct TestCap { + len: u8, + foo: u8, + } + + // SAFETY: All members are simple numbers and any value is valid. + unsafe impl ByteValued for TestCap {} + + impl PciCapability for TestCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } + } + + #[test] + fn add_capability() { + let mut cfg = PciConfiguration::new( + 0x1234, + 0x5678, + 0x1, + PciClassCode::MultimediaController, + &PciMultimediaSubclass::AudioController, + None, + PciHeaderType::Device, + 0xABCD, + 0x2468, + None, + None, + ); + + // Add two capabilities with different contents. + let cap1 = TestCap { len: 4, foo: 0xAA }; + let cap1_offset = cfg.add_capability(&cap1).unwrap(); + assert_eq!(cap1_offset % 4, 0); + + let cap2 = TestCap { + len: 0x04, + foo: 0x55, + }; + let cap2_offset = cfg.add_capability(&cap2).unwrap(); + assert_eq!(cap2_offset % 4, 0); + + // The capability list head should be pointing to cap1. + let cap_ptr = cfg.read_reg(CAPABILITY_LIST_HEAD_OFFSET / 4) & 0xFF; + assert_eq!(cap1_offset, cap_ptr as usize); + + // Verify the contents of the capabilities. 
+ let cap1_data = cfg.read_reg(cap1_offset / 4); + assert_eq!(cap1_data & 0xFF, 0x09); // capability ID + assert_eq!((cap1_data >> 8) & 0xFF, cap2_offset as u32); // next capability pointer + assert_eq!((cap1_data >> 16) & 0xFF, 0x04); // cap1.len + assert_eq!((cap1_data >> 24) & 0xFF, 0xAA); // cap1.foo + + let cap2_data = cfg.read_reg(cap2_offset / 4); + assert_eq!(cap2_data & 0xFF, 0x09); // capability ID + assert_eq!((cap2_data >> 8) & 0xFF, 0x00); // next capability pointer + assert_eq!((cap2_data >> 16) & 0xFF, 0x04); // cap2.len + assert_eq!((cap2_data >> 24) & 0xFF, 0x55); // cap2.foo + } + + #[derive(Copy, Clone)] + enum TestPi { + Test = 0x5a, + } + + impl PciProgrammingInterface for TestPi { + fn get_register_value(&self) -> u8 { + *self as u8 + } + } + + #[test] + fn class_code() { + let cfg = PciConfiguration::new( + 0x1234, + 0x5678, + 0x1, + PciClassCode::MultimediaController, + &PciMultimediaSubclass::AudioController, + Some(&TestPi::Test), + PciHeaderType::Device, + 0xABCD, + 0x2468, + None, + None, + ); + + let class_reg = cfg.read_reg(2); + let class_code = (class_reg >> 24) & 0xFF; + let subclass = (class_reg >> 16) & 0xFF; + let prog_if = (class_reg >> 8) & 0xFF; + assert_eq!(class_code, 0x04); + assert_eq!(subclass, 0x01); + assert_eq!(prog_if, 0x5a); + } +} diff --git a/src/pci/src/device.rs b/src/pci/src/device.rs new file mode 100644 index 00000000000..d3bd3056a36 --- /dev/null +++ b/src/pci/src/device.rs @@ -0,0 +1,136 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. 
+// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::any::Any; +use std::fmt::{self, Display}; +use std::sync::{Arc, Barrier}; +use std::{io, result}; + +use vm_allocator::AddressAllocator; +use vm_device::Resource; + +use crate::configuration::{self, PciBarRegionType}; +use crate::PciBarConfiguration; + +#[derive(Debug)] +pub enum Error { + /// Setup of the device capabilities failed. + CapabilitiesSetup(configuration::Error), + /// Allocating space for an IO BAR failed. + IoAllocationFailed(u64), + /// Registering an IO BAR failed. + IoRegistrationFailed(u64, configuration::Error), + /// Expected resource not found. + MissingResource, + /// Invalid resource. + InvalidResource(Resource), +} +pub type Result = std::result::Result; + +impl Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use self::Error::*; + + match self { + CapabilitiesSetup(e) => write!(f, "failed to add capability {e}"), + IoAllocationFailed(size) => { + write!(f, "failed to allocate space for an IO BAR, size={size}") + } + IoRegistrationFailed(addr, e) => { + write!(f, "failed to register an IO BAR, addr={addr} err={e}") + } + MissingResource => write!(f, "failed to find expected resource"), + InvalidResource(r) => write!(f, "invalid resource {r:?}"), + } + } +} + +#[derive(Clone, Copy)] +pub struct BarReprogrammingParams { + pub old_base: u64, + pub new_base: u64, + pub len: u64, + pub region_type: PciBarRegionType, +} + +pub trait PciDevice: Send { + /// Allocates the needed PCI BARs space using the `allocate` function which takes a size and + /// returns an address. Returns a Vec of (GuestAddress, GuestUsize) tuples. + fn allocate_bars( + &mut self, + _mmio32_allocator: &mut AddressAllocator, + _mmio64_allocator: &mut AddressAllocator, + _resources: Option>, + ) -> Result> { + Ok(Vec::new()) + } + + /// Frees the PCI BARs previously allocated with a call to allocate_bars(). 
+ fn free_bars( + &mut self, + _mmio32_allocator: &mut AddressAllocator, + _mmio64_allocator: &mut AddressAllocator, + ) -> Result<()> { + Ok(()) + } + + /// Sets a register in the configuration space. + /// * `reg_idx` - The index of the config register to modify. + /// * `offset` - Offset into the register. + fn write_config_register( + &mut self, + reg_idx: usize, + offset: u64, + data: &[u8], + ) -> Option>; + /// Gets a register from the configuration space. + /// * `reg_idx` - The index of the config register to read. + fn read_config_register(&mut self, reg_idx: usize) -> u32; + /// Detects if a BAR is being reprogrammed. + fn detect_bar_reprogramming( + &mut self, + _reg_idx: usize, + _data: &[u8], + ) -> Option { + None + } + /// Reads from a BAR region mapped into the device. + /// * `addr` - The guest address inside the BAR. + /// * `data` - Filled with the data from `addr`. + fn read_bar(&mut self, _base: u64, _offset: u64, _data: &mut [u8]) {} + /// Writes to a BAR region mapped into the device. + /// * `addr` - The guest address inside the BAR. + /// * `data` - The data to write. + fn write_bar(&mut self, _base: u64, _offset: u64, _data: &[u8]) -> Option> { + None + } + /// Relocates the BAR to a different address in guest address space. + fn move_bar(&mut self, _old_base: u64, _new_base: u64) -> result::Result<(), io::Error> { + Ok(()) + } + /// Provides a mutable reference to the Any trait. This is useful to let + /// the caller have access to the underlying type behind the trait. + fn as_any_mut(&mut self) -> &mut dyn Any; + + /// Optionally returns a unique identifier. + fn id(&self) -> Option; +} + +/// This trait defines a set of functions which can be triggered whenever a +/// PCI device is modified in any way. +pub trait DeviceRelocation: Send + Sync { + /// The BAR needs to be moved to a different location in the guest address + /// space. This follows a decision from the software running in the guest. 
+ fn move_bar( + &self, + old_base: u64, + new_base: u64, + len: u64, + pci_dev: &mut dyn PciDevice, + region_type: PciBarRegionType, + ) -> result::Result<(), io::Error>; +} diff --git a/src/pci/src/lib.rs b/src/pci/src/lib.rs new file mode 100644 index 00000000000..2672159e474 --- /dev/null +++ b/src/pci/src/lib.rs @@ -0,0 +1,198 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +//! Implements pci devices and busses. +#[macro_use] +extern crate log; + +mod bus; +mod configuration; +mod device; +mod msi; +mod msix; + +use std::fmt::{self, Debug, Display}; +use std::num::ParseIntError; +use std::str::FromStr; + +use serde::de::Visitor; + +pub use self::bus::{PciBus, PciConfigIo, PciConfigMmio, PciRoot, PciRootError}; +pub use self::configuration::{ + PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciCapability, PciCapabilityId, + PciClassCode, PciConfiguration, PciExpressCapabilityId, PciHeaderType, PciMassStorageSubclass, + PciNetworkControllerSubclass, PciProgrammingInterface, PciSerialBusSubClass, PciSubclass, + PCI_CONFIGURATION_ID, +}; +pub use self::device::{ + BarReprogrammingParams, DeviceRelocation, Error as PciDeviceError, PciDevice, +}; +pub use self::msi::{msi_num_enabled_vectors, MsiCap, MsiConfig}; +pub use self::msix::{MsixCap, MsixConfig, MsixTableEntry, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE}; + +/// PCI has four interrupt pins A->D. 
+#[derive(Copy, Clone)] +pub enum PciInterruptPin { + IntA, + IntB, + IntC, + IntD, +} + +impl PciInterruptPin { + pub fn to_mask(self) -> u32 { + self as u32 + } +} + +#[cfg(target_arch = "x86_64")] +pub const PCI_CONFIG_IO_PORT: u64 = 0xcf8; +#[cfg(target_arch = "x86_64")] +pub const PCI_CONFIG_IO_PORT_SIZE: u64 = 0x8; + +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)] +pub struct PciBdf(u32); + +struct PciBdfVisitor; + +impl Visitor<'_> for PciBdfVisitor { + type Value = PciBdf; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("struct PciBdf") + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + Ok(v.into()) + } +} + +impl<'de> serde::Deserialize<'de> for PciBdf { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + deserializer.deserialize_str(PciBdfVisitor) + } +} + +impl serde::Serialize for PciBdf { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + serializer.collect_str(&self.to_string()) + } +} + +impl PciBdf { + pub fn segment(&self) -> u16 { + ((self.0 >> 16) & 0xffff) as u16 + } + + pub fn bus(&self) -> u8 { + ((self.0 >> 8) & 0xff) as u8 + } + + pub fn device(&self) -> u8 { + ((self.0 >> 3) & 0x1f) as u8 + } + + pub fn function(&self) -> u8 { + (self.0 & 0x7) as u8 + } + + pub fn new(segment: u16, bus: u8, device: u8, function: u8) -> Self { + Self( + ((segment as u32) << 16) + | ((bus as u32) << 8) + | (((device & 0x1f) as u32) << 3) + | (function & 0x7) as u32, + ) + } +} + +impl From for PciBdf { + fn from(bdf: u32) -> Self { + Self(bdf) + } +} + +impl From for u32 { + fn from(bdf: PciBdf) -> Self { + bdf.0 + } +} + +impl From<&PciBdf> for u32 { + fn from(bdf: &PciBdf) -> Self { + bdf.0 + } +} + +impl From for u16 { + fn from(bdf: PciBdf) -> Self { + (bdf.0 & 0xffff) as u16 + } +} + +impl From<&PciBdf> for u16 { + fn from(bdf: &PciBdf) -> Self { + (bdf.0 & 0xffff) as u16 + } +} + +impl Debug for PciBdf 
{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{:04x}:{:02x}:{:02x}.{:01x}", + self.segment(), + self.bus(), + self.device(), + self.function() + ) + } +} + +impl Display for PciBdf { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{:04x}:{:02x}:{:02x}.{:01x}", + self.segment(), + self.bus(), + self.device(), + self.function() + ) + } +} + +impl FromStr for PciBdf { + type Err = ParseIntError; + + fn from_str(s: &str) -> Result { + let items: Vec<&str> = s.split('.').collect(); + assert_eq!(items.len(), 2); + let function = u8::from_str_radix(items[1], 16)?; + let items: Vec<&str> = items[0].split(':').collect(); + assert_eq!(items.len(), 3); + let segment = u16::from_str_radix(items[0], 16)?; + let bus = u8::from_str_radix(items[1], 16)?; + let device = u8::from_str_radix(items[2], 16)?; + Ok(PciBdf::new(segment, bus, device, function)) + } +} + +impl From<&str> for PciBdf { + fn from(bdf: &str) -> Self { + Self::from_str(bdf).unwrap() + } +} diff --git a/src/pci/src/msi.rs b/src/pci/src/msi.rs new file mode 100644 index 00000000000..16d593cd115 --- /dev/null +++ b/src/pci/src/msi.rs @@ -0,0 +1,282 @@ +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause +// + +use std::io; +use std::sync::Arc; + +use byteorder::{ByteOrder, LittleEndian}; +use serde::{Deserialize, Serialize}; +use thiserror::Error; +use vm_device::interrupt::{ + InterruptIndex, InterruptSourceConfig, InterruptSourceGroup, MsiIrqSourceConfig, +}; + +// MSI control masks +const MSI_CTL_ENABLE: u16 = 0x1; +const MSI_CTL_MULTI_MSG_ENABLE: u16 = 0x70; +const MSI_CTL_64_BITS: u16 = 0x80; +const MSI_CTL_PER_VECTOR: u16 = 0x100; + +// MSI message offsets +const MSI_MSG_CTL_OFFSET: u64 = 0x2; +const MSI_MSG_ADDR_LO_OFFSET: u64 = 0x4; + +// MSI message masks +const MSI_MSG_ADDR_LO_MASK: u32 = 0xffff_fffc; + +pub fn msi_num_enabled_vectors(msg_ctl: u16) -> usize { + let field = 
(msg_ctl >> 4) & 0x7; + + if field > 5 { + return 0; + } + + 1 << field +} + +#[derive(Error, Debug)] +pub enum Error { + #[error("Failed enabling the interrupt route: {0}")] + EnableInterruptRoute(io::Error), + #[error("Failed updating the interrupt route: {0}")] + UpdateInterruptRoute(io::Error), +} + +#[derive(Clone, Copy, Default, Serialize, Deserialize)] +pub struct MsiCap { + // Message Control Register + // 0: MSI enable. + // 3-1; Multiple message capable. + // 6-4: Multiple message enable. + // 7: 64 bits address capable. + // 8: Per-vector masking capable. + // 15-9: Reserved. + pub msg_ctl: u16, + // Message Address (LSB) + // 1-0: Reserved. + // 31-2: Message address. + pub msg_addr_lo: u32, + // Message Upper Address (MSB) + // 31-0: Message address. + pub msg_addr_hi: u32, + // Message Data + // 15-0: Message data. + pub msg_data: u16, + // Mask Bits + // 31-0: Mask bits. + pub mask_bits: u32, + // Pending Bits + // 31-0: Pending bits. + pub pending_bits: u32, +} + +impl MsiCap { + fn addr_64_bits(&self) -> bool { + self.msg_ctl & MSI_CTL_64_BITS == MSI_CTL_64_BITS + } + + fn per_vector_mask(&self) -> bool { + self.msg_ctl & MSI_CTL_PER_VECTOR == MSI_CTL_PER_VECTOR + } + + fn enabled(&self) -> bool { + self.msg_ctl & MSI_CTL_ENABLE == MSI_CTL_ENABLE + } + + fn num_enabled_vectors(&self) -> usize { + msi_num_enabled_vectors(self.msg_ctl) + } + + fn vector_masked(&self, vector: usize) -> bool { + if !self.per_vector_mask() { + return false; + } + + (self.mask_bits >> vector) & 0x1 == 0x1 + } + + fn size(&self) -> u64 { + let mut size: u64 = 0xa; + + if self.addr_64_bits() { + size += 0x4; + } + if self.per_vector_mask() { + size += 0xa; + } + + size + } + + fn update(&mut self, offset: u64, data: &[u8]) { + // Calculate message data offset depending on the address being 32 or + // 64 bits. + // Calculate upper address offset if the address is 64 bits. 
+ // Calculate mask bits offset based on the address being 32 or 64 bits + // and based on the per vector masking being enabled or not. + let (msg_data_offset, addr_hi_offset, mask_bits_offset): (u64, Option, Option) = + if self.addr_64_bits() { + let mask_bits = if self.per_vector_mask() { + Some(0x10) + } else { + None + }; + (0xc, Some(0x8), mask_bits) + } else { + let mask_bits = if self.per_vector_mask() { + Some(0xc) + } else { + None + }; + (0x8, None, mask_bits) + }; + + // Update cache without overriding the read-only bits. + match data.len() { + 2 => { + let value = LittleEndian::read_u16(data); + match offset { + MSI_MSG_CTL_OFFSET => { + self.msg_ctl = (self.msg_ctl & !(MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) + | (value & (MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) + } + x if x == msg_data_offset => self.msg_data = value, + _ => error!("invalid offset"), + } + } + 4 => { + let value = LittleEndian::read_u32(data); + match offset { + 0x0 => { + self.msg_ctl = (self.msg_ctl & !(MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) + | ((value >> 16) as u16 & (MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) + } + MSI_MSG_ADDR_LO_OFFSET => self.msg_addr_lo = value & MSI_MSG_ADDR_LO_MASK, + x if x == msg_data_offset => self.msg_data = value as u16, + x if addr_hi_offset.is_some() && x == addr_hi_offset.unwrap() => { + self.msg_addr_hi = value + } + x if mask_bits_offset.is_some() && x == mask_bits_offset.unwrap() => { + self.mask_bits = value + } + _ => error!("invalid offset"), + } + } + _ => error!("invalid data length"), + } + } +} + +#[derive(Serialize, Deserialize)] +pub struct MsiConfigState { + cap: MsiCap, +} + +pub struct MsiConfig { + pub cap: MsiCap, + interrupt_source_group: Arc, +} + +impl MsiConfig { + pub fn new( + msg_ctl: u16, + interrupt_source_group: Arc, + state: Option, + ) -> Result { + let cap = if let Some(state) = state { + if state.cap.enabled() { + for idx in 0..state.cap.num_enabled_vectors() { + let config = MsiIrqSourceConfig { + 
high_addr: state.cap.msg_addr_hi, + low_addr: state.cap.msg_addr_lo, + data: state.cap.msg_data as u32, + devid: 0, + }; + + interrupt_source_group + .update( + idx as InterruptIndex, + InterruptSourceConfig::MsiIrq(config), + state.cap.vector_masked(idx), + false, + ) + .map_err(Error::UpdateInterruptRoute)?; + } + + interrupt_source_group + .set_gsi() + .map_err(Error::EnableInterruptRoute)?; + + interrupt_source_group + .enable() + .map_err(Error::EnableInterruptRoute)?; + } + + state.cap + } else { + MsiCap { + msg_ctl, + ..Default::default() + } + }; + + Ok(MsiConfig { + cap, + interrupt_source_group, + }) + } + + pub fn state(&self) -> MsiConfigState { + MsiConfigState { cap: self.cap } + } + + pub fn enabled(&self) -> bool { + self.cap.enabled() + } + + pub fn size(&self) -> u64 { + self.cap.size() + } + + pub fn num_enabled_vectors(&self) -> usize { + self.cap.num_enabled_vectors() + } + + pub fn update(&mut self, offset: u64, data: &[u8]) { + let old_enabled = self.cap.enabled(); + + self.cap.update(offset, data); + + if self.cap.enabled() { + for idx in 0..self.num_enabled_vectors() { + let config = MsiIrqSourceConfig { + high_addr: self.cap.msg_addr_hi, + low_addr: self.cap.msg_addr_lo, + data: self.cap.msg_data as u32, + devid: 0, + }; + + if let Err(e) = self.interrupt_source_group.update( + idx as InterruptIndex, + InterruptSourceConfig::MsiIrq(config), + self.cap.vector_masked(idx), + true, + ) { + error!("Failed updating vector: {:?}", e); + } + } + + if !old_enabled { + if let Err(e) = self.interrupt_source_group.enable() { + error!("Failed enabling irq_fd: {:?}", e); + } + } + } else if old_enabled { + if let Err(e) = self.interrupt_source_group.disable() { + error!("Failed disabling irq_fd: {:?}", e); + } + } + } +} diff --git a/src/pci/src/msix.rs b/src/pci/src/msix.rs new file mode 100644 index 00000000000..4b3cf688980 --- /dev/null +++ b/src/pci/src/msix.rs @@ -0,0 +1,552 @@ +// Copyright © 2019 Intel Corporation +// +// 
SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause +// + +use std::sync::Arc; +use std::{io, result}; + +use byteorder::{ByteOrder, LittleEndian}; +use serde::{Deserialize, Serialize}; +use vm_device::interrupt::{ + InterruptIndex, InterruptSourceConfig, InterruptSourceGroup, MsiIrqSourceConfig, +}; +use vm_memory::ByteValued; + +use crate::{PciCapability, PciCapabilityId}; + +const MAX_MSIX_VECTORS_PER_DEVICE: u16 = 2048; +const MSIX_TABLE_ENTRIES_MODULO: u64 = 16; +const MSIX_PBA_ENTRIES_MODULO: u64 = 8; +const BITS_PER_PBA_ENTRY: usize = 64; +const FUNCTION_MASK_BIT: u8 = 14; +const MSIX_ENABLE_BIT: u8 = 15; +const FUNCTION_MASK_MASK: u16 = (1 << FUNCTION_MASK_BIT) as u16; +const MSIX_ENABLE_MASK: u16 = (1 << MSIX_ENABLE_BIT) as u16; +pub const MSIX_TABLE_ENTRY_SIZE: usize = 16; +pub const MSIX_CONFIG_ID: &str = "msix_config"; + +#[derive(Debug)] +pub enum Error { + /// Failed enabling the interrupt route. + EnableInterruptRoute(io::Error), + /// Failed updating the interrupt route. 
+ UpdateInterruptRoute(io::Error), +} + +#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] +pub struct MsixTableEntry { + pub msg_addr_lo: u32, + pub msg_addr_hi: u32, + pub msg_data: u32, + pub vector_ctl: u32, +} + +impl MsixTableEntry { + pub fn masked(&self) -> bool { + self.vector_ctl & 0x1 == 0x1 + } +} + +impl Default for MsixTableEntry { + fn default() -> Self { + MsixTableEntry { + msg_addr_lo: 0, + msg_addr_hi: 0, + msg_data: 0, + vector_ctl: 0x1, + } + } +} + +#[derive(Serialize, Deserialize)] +pub struct MsixConfigState { + table_entries: Vec, + pba_entries: Vec, + masked: bool, + enabled: bool, +} + +pub struct MsixConfig { + pub table_entries: Vec, + pub pba_entries: Vec, + pub devid: u32, + interrupt_source_group: Arc, + masked: bool, + enabled: bool, +} + +impl MsixConfig { + pub fn new( + msix_vectors: u16, + interrupt_source_group: Arc, + devid: u32, + state: Option, + ) -> result::Result { + assert!(msix_vectors <= MAX_MSIX_VECTORS_PER_DEVICE); + + let (table_entries, pba_entries, masked, enabled) = if let Some(state) = state { + if state.enabled && !state.masked { + for (idx, table_entry) in state.table_entries.iter().enumerate() { + if table_entry.masked() { + continue; + } + + let config = MsiIrqSourceConfig { + high_addr: table_entry.msg_addr_hi, + low_addr: table_entry.msg_addr_lo, + data: table_entry.msg_data, + devid, + }; + + interrupt_source_group + .update( + idx as InterruptIndex, + InterruptSourceConfig::MsiIrq(config), + state.masked, + true, + ) + .map_err(Error::UpdateInterruptRoute)?; + + interrupt_source_group + .enable() + .map_err(Error::EnableInterruptRoute)?; + } + } + + ( + state.table_entries, + state.pba_entries, + state.masked, + state.enabled, + ) + } else { + let mut table_entries: Vec = Vec::new(); + table_entries.resize_with(msix_vectors as usize, Default::default); + let mut pba_entries: Vec = Vec::new(); + let num_pba_entries: usize = ((msix_vectors as usize) / BITS_PER_PBA_ENTRY) + 1; + 
pba_entries.resize_with(num_pba_entries, Default::default); + + (table_entries, pba_entries, true, false) + }; + + Ok(MsixConfig { + table_entries, + pba_entries, + devid, + interrupt_source_group, + masked, + enabled, + }) + } + + pub fn state(&self) -> MsixConfigState { + MsixConfigState { + table_entries: self.table_entries.clone(), + pba_entries: self.pba_entries.clone(), + masked: self.masked, + enabled: self.enabled, + } + } + + pub fn masked(&self) -> bool { + self.masked + } + + pub fn enabled(&self) -> bool { + self.enabled + } + + pub fn set_msg_ctl(&mut self, reg: u16) { + let old_masked = self.masked; + let old_enabled = self.enabled; + + self.masked = ((reg >> FUNCTION_MASK_BIT) & 1u16) == 1u16; + self.enabled = ((reg >> MSIX_ENABLE_BIT) & 1u16) == 1u16; + + // Update interrupt routing + if old_masked != self.masked || old_enabled != self.enabled { + if self.enabled && !self.masked { + debug!("MSI-X enabled for device 0x{:x}", self.devid); + for (idx, table_entry) in self.table_entries.iter().enumerate() { + let config = MsiIrqSourceConfig { + high_addr: table_entry.msg_addr_hi, + low_addr: table_entry.msg_addr_lo, + data: table_entry.msg_data, + devid: self.devid, + }; + + if let Err(e) = self.interrupt_source_group.update( + idx as InterruptIndex, + InterruptSourceConfig::MsiIrq(config), + table_entry.masked(), + true, + ) { + error!("Failed updating vector: {:?}", e); + } + } + } else if old_enabled || !old_masked { + debug!("MSI-X disabled for device 0x{:x}", self.devid); + if let Err(e) = self.interrupt_source_group.disable() { + error!("Failed disabling irq_fd: {:?}", e); + } + } + } + + // If the Function Mask bit was set, and has just been cleared, it's + // important to go through the entire PBA to check if there was any + // pending MSI-X message to inject, given that the vector is not + // masked. 
+ if old_masked && !self.masked { + for (index, entry) in self.table_entries.clone().iter().enumerate() { + if !entry.masked() && self.get_pba_bit(index as u16) == 1 { + self.inject_msix_and_clear_pba(index); + } + } + } + } + + pub fn read_table(&self, offset: u64, data: &mut [u8]) { + assert!((data.len() == 4 || data.len() == 8)); + + let index: usize = (offset / MSIX_TABLE_ENTRIES_MODULO) as usize; + let modulo_offset = offset % MSIX_TABLE_ENTRIES_MODULO; + + if index >= self.table_entries.len() { + debug!("Invalid MSI-X table entry index {index}"); + data.copy_from_slice(&[0xff; 8][..data.len()]); + return; + } + + match data.len() { + 4 => { + let value = match modulo_offset { + 0x0 => self.table_entries[index].msg_addr_lo, + 0x4 => self.table_entries[index].msg_addr_hi, + 0x8 => self.table_entries[index].msg_data, + 0xc => self.table_entries[index].vector_ctl, + _ => { + error!("invalid offset"); + 0 + } + }; + + debug!("MSI_R TABLE offset 0x{:x} data 0x{:x}", offset, value); + LittleEndian::write_u32(data, value); + } + 8 => { + let value = match modulo_offset { + 0x0 => { + (u64::from(self.table_entries[index].msg_addr_hi) << 32) + | u64::from(self.table_entries[index].msg_addr_lo) + } + 0x8 => { + (u64::from(self.table_entries[index].vector_ctl) << 32) + | u64::from(self.table_entries[index].msg_data) + } + _ => { + error!("invalid offset"); + 0 + } + }; + + debug!("MSI_R TABLE offset 0x{:x} data 0x{:x}", offset, value); + LittleEndian::write_u64(data, value); + } + _ => { + error!("invalid data length"); + } + } + } + + pub fn write_table(&mut self, offset: u64, data: &[u8]) { + assert!((data.len() == 4 || data.len() == 8)); + + let index: usize = (offset / MSIX_TABLE_ENTRIES_MODULO) as usize; + let modulo_offset = offset % MSIX_TABLE_ENTRIES_MODULO; + + if index >= self.table_entries.len() { + debug!("Invalid MSI-X table entry index {index}"); + return; + } + + // Store the value of the entry before modification + let old_entry = 
self.table_entries[index].clone(); + + match data.len() { + 4 => { + let value = LittleEndian::read_u32(data); + match modulo_offset { + 0x0 => self.table_entries[index].msg_addr_lo = value, + 0x4 => self.table_entries[index].msg_addr_hi = value, + 0x8 => self.table_entries[index].msg_data = value, + 0xc => { + self.table_entries[index].vector_ctl = value; + } + _ => error!("invalid offset"), + }; + + debug!("MSI_W TABLE offset 0x{:x} data 0x{:x}", offset, value); + } + 8 => { + let value = LittleEndian::read_u64(data); + match modulo_offset { + 0x0 => { + self.table_entries[index].msg_addr_lo = (value & 0xffff_ffffu64) as u32; + self.table_entries[index].msg_addr_hi = (value >> 32) as u32; + } + 0x8 => { + self.table_entries[index].msg_data = (value & 0xffff_ffffu64) as u32; + self.table_entries[index].vector_ctl = (value >> 32) as u32; + } + _ => error!("invalid offset"), + }; + + debug!("MSI_W TABLE offset 0x{:x} data 0x{:x}", offset, value); + } + _ => error!("invalid data length"), + }; + + let table_entry = &self.table_entries[index]; + + // Optimisation to avoid excessive updates + if &old_entry == table_entry { + return; + } + + // Update interrupt routes + // Optimisation: only update routes if the entry is not masked; + // this is safe because if the entry is masked (starts masked as per spec) + // in the table then it won't be triggered. (See: #4273) + if self.enabled && !self.masked && !table_entry.masked() { + let config = MsiIrqSourceConfig { + high_addr: table_entry.msg_addr_hi, + low_addr: table_entry.msg_addr_lo, + data: table_entry.msg_data, + devid: self.devid, + }; + + if let Err(e) = self.interrupt_source_group.update( + index as InterruptIndex, + InterruptSourceConfig::MsiIrq(config), + table_entry.masked(), + true, + ) { + error!("Failed updating vector: {:?}", e); + } + } + + // After the MSI-X table entry has been updated, it is necessary to + // check if the vector control masking bit has changed. 
In case the + // bit has been flipped from 1 to 0, we need to inject a MSI message + // if the corresponding pending bit from the PBA is set. Once the MSI + // has been injected, the pending bit in the PBA needs to be cleared. + // All of this is valid only if MSI-X has not been masked for the whole + // device. + + // Check if bit has been flipped + if !self.masked() + && self.enabled() + && old_entry.masked() + && !table_entry.masked() + && self.get_pba_bit(index as u16) == 1 + { + self.inject_msix_and_clear_pba(index); + } + } + + pub fn read_pba(&mut self, offset: u64, data: &mut [u8]) { + assert!((data.len() == 4 || data.len() == 8)); + + let index: usize = (offset / MSIX_PBA_ENTRIES_MODULO) as usize; + let modulo_offset = offset % MSIX_PBA_ENTRIES_MODULO; + + if index >= self.pba_entries.len() { + debug!("Invalid MSI-X PBA entry index {index}"); + data.copy_from_slice(&[0xff; 8][..data.len()]); + return; + } + + match data.len() { + 4 => { + let value: u32 = match modulo_offset { + 0x0 => (self.pba_entries[index] & 0xffff_ffffu64) as u32, + 0x4 => (self.pba_entries[index] >> 32) as u32, + _ => { + error!("invalid offset"); + 0 + } + }; + + debug!("MSI_R PBA offset 0x{:x} data 0x{:x}", offset, value); + LittleEndian::write_u32(data, value); + } + 8 => { + let value: u64 = match modulo_offset { + 0x0 => self.pba_entries[index], + _ => { + error!("invalid offset"); + 0 + } + }; + + debug!("MSI_R PBA offset 0x{:x} data 0x{:x}", offset, value); + LittleEndian::write_u64(data, value); + } + _ => { + error!("invalid data length"); + } + } + } + + pub fn write_pba(&mut self, _offset: u64, _data: &[u8]) { + error!("Pending Bit Array is read only"); + } + + pub fn set_pba_bit(&mut self, vector: u16, reset: bool) { + assert!(vector < MAX_MSIX_VECTORS_PER_DEVICE); + + let index: usize = (vector as usize) / BITS_PER_PBA_ENTRY; + let shift: usize = (vector as usize) % BITS_PER_PBA_ENTRY; + let mut mask: u64 = 1u64 << shift; // `1` alone would be i32 and overflow + + if reset { + mask = !mask; +
self.pba_entries[index] &= mask; + } else { + self.pba_entries[index] |= mask; + } + } + + fn get_pba_bit(&self, vector: u16) -> u8 { + assert!(vector < MAX_MSIX_VECTORS_PER_DEVICE); + + let index: usize = (vector as usize) / BITS_PER_PBA_ENTRY; + let shift: usize = (vector as usize) % BITS_PER_PBA_ENTRY; + + ((self.pba_entries[index] >> shift) & 0x0000_0001u64) as u8 + } + + fn inject_msix_and_clear_pba(&mut self, vector: usize) { + // Inject the MSI message + match self + .interrupt_source_group + .trigger(vector as InterruptIndex) + { + Ok(_) => debug!("MSI-X injected on vector control flip"), + Err(e) => error!("failed to inject MSI-X: {}", e), + } + + // Clear the bit from PBA + self.set_pba_bit(vector as u16, true); + } +} + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Clone, Copy, Default, Serialize, Deserialize)] +pub struct MsixCap { + // Message Control Register + // 10-0: MSI-X Table size + // 13-11: Reserved + // 14: Mask. Mask all MSI-X when set. + // 15: Enable. Enable all MSI-X when set. + pub msg_ctl: u16, + // Table. Contains the offset and the BAR indicator (BIR) + // 2-0: Table BAR indicator (BIR). Can be 0 to 5. + // 31-3: Table offset in the BAR pointed by the BIR. + pub table: u32, + // Pending Bit Array. Contains the offset and the BAR indicator (BIR) + // 2-0: PBA BAR indicator (BIR). Can be 0 to 5. + // 31-3: PBA offset in the BAR pointed by the BIR. + pub pba: u32, +} + +// SAFETY: All members are simple numbers and any value is valid. +unsafe impl ByteValued for MsixCap {} + +impl PciCapability for MsixCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::MsiX + } +} + +impl MsixCap { + pub fn new( + table_pci_bar: u8, + table_size: u16, + table_off: u32, + pba_pci_bar: u8, + pba_off: u32, + ) -> Self { + assert!(table_size < MAX_MSIX_VECTORS_PER_DEVICE); + + // Set the table size and enable MSI-X. 
+ let msg_ctl: u16 = 0x8000u16 + table_size - 1; + + MsixCap { + msg_ctl, + table: (table_off & 0xffff_fff8u32) | u32::from(table_pci_bar & 0x7u8), + pba: (pba_off & 0xffff_fff8u32) | u32::from(pba_pci_bar & 0x7u8), + } + } + + pub fn set_msg_ctl(&mut self, data: u16) { + self.msg_ctl = (self.msg_ctl & !(FUNCTION_MASK_MASK | MSIX_ENABLE_MASK)) + | (data & (FUNCTION_MASK_MASK | MSIX_ENABLE_MASK)); + } + + pub fn masked(&self) -> bool { + (self.msg_ctl >> FUNCTION_MASK_BIT) & 0x1 == 0x1 + } + + pub fn enabled(&self) -> bool { + (self.msg_ctl >> MSIX_ENABLE_BIT) & 0x1 == 0x1 + } + + pub fn table_offset(&self) -> u32 { + self.table & 0xffff_fff8 + } + + pub fn pba_offset(&self) -> u32 { + self.pba & 0xffff_fff8 + } + + pub fn table_set_offset(&mut self, addr: u32) { + self.table &= 0x7; + self.table += addr; + } + + pub fn pba_set_offset(&mut self, addr: u32) { + self.pba &= 0x7; + self.pba += addr; + } + + pub fn table_bir(&self) -> u32 { + self.table & 0x7 + } + + pub fn pba_bir(&self) -> u32 { + self.pba & 0x7 + } + + pub fn table_size(&self) -> u16 { + (self.msg_ctl & 0x7ff) + 1 + } + + pub fn table_range(&self) -> (u64, u64) { + // The table takes 16 bytes per entry. + let size = self.table_size() as u64 * 16; + (self.table_offset() as u64, size) + } + + pub fn pba_range(&self) -> (u64, u64) { + // The table takes 1 bit per entry modulo 8 bytes. 
+ let size = ((self.table_size() as u64 / 64) + 1) * 8; + (self.pba_offset() as u64, size) + } +} diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index 995f314e46f..bee0f88efa8 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -14,6 +14,7 @@ tracing = ["log-instrument"] gdb = ["arrayvec", "gdbstub", "gdbstub_arch"] [dependencies] + acpi_tables = { path = "../acpi-tables" } aes-gcm = { version = "0.10.1", default-features = false, features = ["aes"] } arrayvec = { version = "0.7.6", optional = true } @@ -38,6 +39,7 @@ log = { version = "0.4.27", features = ["std", "serde"] } log-instrument = { path = "../log-instrument", optional = true } memfd = "0.6.3" micro_http = { git = "https://github.com/firecracker-microvm/micro-http" } +pci = { path = "../pci" } semver = { version = "1.0.26", features = ["serde"] } serde = { version = "1.0.219", features = ["derive", "rc"] } serde_json = "1.0.140" @@ -46,8 +48,10 @@ thiserror = "2.0.12" timerfd = "1.5.0" userfaultfd = "0.8.1" utils = { path = "../utils" } +uuid = "1.16.0" vhost = { version = "0.14.0", features = ["vhost-user-frontend"] } vm-allocator = "0.1.3" +vm-device = { path = "../vm-device" } vm-memory = { version = "0.16.2", features = [ "backend-mmap", "backend-bitmap", diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index d9ba26015b6..09c026df610 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -450,7 +450,7 @@ impl MMIODeviceManager { // Stats queue doesn't need kicking as it is notified via a `timer_fd`. 
if balloon.is_activated() { info!("kick balloon {}.", id); - balloon.process_virtio_queues(); + balloon.process_virtio_queues().unwrap(); } } TYPE_BLOCK => { diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index b66228aeebe..b8e336f1dad 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -173,7 +173,7 @@ pub fn create_snapshot( .mmio_devices .for_each_virtio_device(|_, _, device| { let mmio_dev_locked = device.inner.lock().expect("Poisoned lock"); - let d = mmio_dev_locked.locked_device(); + let mut d = mmio_dev_locked.locked_device(); if d.is_activated() { d.mark_queue_memory_dirty(vmm.vm.guest_memory()) } else { From 64738fb2038acb85a406c49184ab87f0544245c8 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 5 May 2025 17:52:08 +0200 Subject: [PATCH 15/56] arch: define 64-bit capable MMIO memory regions PCIe distinguishes MMIO regions between 32bit and 64bit, caring for devices that can't deal with 64-bit addresses. This commit defines the appropriate regions for both x86 and aarch64 architectures, extends the resource allocator to handle allocations for both of these regions and adjusts the logic that calculates the memory regions for the architecture. Also, un-do the change that added an `offset` argument `arch_memory_regions` function. We won't be using this for "secret hiding" so it just made the logic (especially for kani proofs) too convoluted. 
Signed-off-by: Babis Chalios --- src/vmm/src/arch/aarch64/gic/gicv2/mod.rs | 2 +- src/vmm/src/arch/aarch64/gic/gicv3/mod.rs | 2 +- src/vmm/src/arch/aarch64/layout.rs | 69 ++++- src/vmm/src/arch/aarch64/mod.rs | 144 ++++++---- src/vmm/src/arch/mod.rs | 48 +++- src/vmm/src/arch/x86_64/layout.rs | 45 ++++ src/vmm/src/arch/x86_64/mod.rs | 251 ++++++++++-------- src/vmm/src/builder.rs | 10 +- src/vmm/src/device_manager/mmio.rs | 31 ++- src/vmm/src/device_manager/mod.rs | 7 +- src/vmm/src/device_manager/persist.rs | 23 +- src/vmm/src/device_manager/resources.rs | 45 +++- .../src/devices/virtio/vsock/event_handler.rs | 16 +- src/vmm/src/resources.rs | 2 +- src/vmm/src/test_utils/mod.rs | 4 +- 15 files changed, 453 insertions(+), 246 deletions(-) diff --git a/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs index 22aaa4b4b74..c4b9208a0a6 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs @@ -30,7 +30,7 @@ impl GICv2 { /// Get the address of the GICv2 distributor. const fn get_dist_addr() -> u64 { - super::layout::MAPPED_IO_START - GICv2::KVM_VGIC_V2_DIST_SIZE + super::layout::MMIO32_MEM_START - GICv2::KVM_VGIC_V2_DIST_SIZE } /// Get the size of the GIC_v2 distributor. diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs index 558b47ab065..39c4e5ce148 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs @@ -30,7 +30,7 @@ impl GICv3 { /// Get the address of the GIC distributor. fn get_dist_addr() -> u64 { - super::layout::MAPPED_IO_START - GICv3::KVM_VGIC_V3_DIST_SIZE + super::layout::MMIO32_MEM_START - GICv3::KVM_VGIC_V3_DIST_SIZE } /// Get the size of the GIC distributor. 
diff --git a/src/vmm/src/arch/aarch64/layout.rs b/src/vmm/src/arch/aarch64/layout.rs index 922cfbb66e6..bdecc712696 100644 --- a/src/vmm/src/arch/aarch64/layout.rs +++ b/src/vmm/src/arch/aarch64/layout.rs @@ -4,51 +4,53 @@ // ==== Address map in use in ARM development systems today ==== // // - 32-bit - - 36-bit - - 40-bit - -// 1024GB + + +-------------------+ <- 40-bit +// 1024GB + + +-------------------+ <- 40-bit // | | DRAM | // ~ ~ ~ ~ // | | | // | | | // | | | // | | | -// 544GB + + +-------------------+ +// 544GB + + +-------------------+ // | | Hole or DRAM | // | | | -// 512GB + + +-------------------+ +// 512GB + + +-------------------+ // | | Mapped | // | | I/O | // ~ ~ ~ ~ // | | | -// 256GB + + +-------------------+ +// 256GB + + +-------------------+ // | | Reserved | // ~ ~ ~ ~ // | | | -// 64GB + +-----------------------+-------------------+ <- 36-bit +// 64GB + +-----------------------+-------------------+ <- 36-bit // | | DRAM | // ~ ~ ~ ~ // | | | // | | | -// 34GB + +-----------------------+-------------------+ +// 34GB + +-----------------------+-------------------+ // | | Hole or DRAM | -// 32GB + +-----------------------+-------------------+ +// 32GB + +-----------------------+-------------------+ // | | Mapped I/O | // ~ ~ ~ ~ // | | | -// 16GB + +-----------------------+-------------------+ +// 16GB + +-----------------------+-------------------+ // | | Reserved | // ~ ~ ~ ~ -// 4GB +-------------------+-----------------------+-------------------+ <- 32-bit +// 4GB +-------------------+-----------------------+-------------------+ <- 32-bit // | 2GB of DRAM | // | | -// 2GB +-------------------+-----------------------+-------------------+ +// 2GB +-------------------+-----------------------+-------------------+ // | Mapped I/O | -// 1GB +-------------------+-----------------------+-------------------+ +// 1GB +-------------------+-----------------------+-------------------+ // | ROM & RAM & I/O | -// 0GB 
+-------------------+-----------------------+-------------------+ 0 +// 0GB +-------------------+-----------------------+-------------------+ 0 // - 32-bit - - 36-bit - - 40-bit - // // Taken from (http://infocenter.arm.com/help/topic/com.arm.doc.den0001c/DEN0001C_principles_of_arm_memory_maps.pdf). +use crate::device_manager::mmio::MMIO_LEN; + /// Start of RAM on 64 bit ARM. pub const DRAM_MEM_START: u64 = 0x8000_0000; // 2 GB. /// The maximum RAM size. @@ -80,5 +82,46 @@ pub const IRQ_MAX: u32 = 128; /// First usable interrupt on aarch64. pub const IRQ_BASE: u32 = 32; +/// The start of the memory area reserved for MMIO 32-bit accesses. /// Below this address will reside the GIC, above this address will reside the MMIO devices. -pub const MAPPED_IO_START: u64 = 1 << 30; // 1 GB +pub const MMIO32_MEM_START: u64 = 1 << 30; // 1GiB +/// The size of the memory area reserved for MMIO 32-bit accesses (1GiB). +pub const MMIO32_MEM_SIZE: u64 = DRAM_MEM_START - MMIO32_MEM_START; + +// The rest of the MMIO address space (256 MiB) we dedicate to PCIe for memory-mapped access to +// configuration. +/// Size of MMIO region for PCIe configuration accesses. +pub const PCI_MMCONFIG_SIZE: u64 = 256 << 20; +/// Start of MMIO region for PCIe configuration accesses. +pub const PCI_MMCONFIG_START: u64 = DRAM_MEM_START - PCI_MMCONFIG_SIZE; +/// MMIO space per PCIe segment +pub const PCI_MMIO_CONFIG_SIZE_PER_SEGMENT: u64 = 4096 * 256; + +// We reserve 768 MiB for devices at the beginning of the MMIO region. This includes space both for +// pure MMIO and PCIe devices. + +/// Memory region start for boot device. +pub const BOOT_DEVICE_MEM_START: u64 = MMIO32_MEM_START; +/// Memory region start for RTC device. +pub const RTC_MEM_START: u64 = BOOT_DEVICE_MEM_START + MMIO_LEN; +/// Memory region start for Serial device. 
+pub const SERIAL_MEM_START: u64 = RTC_MEM_START + MMIO_LEN; + +/// Beginning of memory region for device MMIO 32-bit accesses +pub const MEM_32BIT_DEVICES_START: u64 = SERIAL_MEM_START + MMIO_LEN; +/// Size of memory region for device MMIO 32-bit accesses +pub const MEM_32BIT_DEVICES_SIZE: u64 = PCI_MMCONFIG_START - MEM_32BIT_DEVICES_START; + +// 64-bits region for MMIO accesses +/// The start of the memory area reserved for MMIO 64-bit accesses. +pub const MMIO64_MEM_START: u64 = 256 << 30; +/// The size of the memory area reserved for MMIO 64-bit accesses. +pub const MMIO64_MEM_SIZE: u64 = 256 << 30; + +// At the moment, all of this region goes to devices +/// Beginning of memory region for device MMIO 64-bit accesses +pub const MEM_64BIT_DEVICES_START: u64 = MMIO64_MEM_START; +/// Size of memory region for device MMIO 64-bit accesses +pub const MEM_64BIT_DEVICES_SIZE: u64 = MMIO64_MEM_SIZE; +/// First address past the 64-bit MMIO gap +pub const FIRST_ADDR_PAST_64BITS_MMIO: u64 = MMIO64_MEM_START + MMIO64_MEM_SIZE; diff --git a/src/vmm/src/arch/aarch64/mod.rs b/src/vmm/src/arch/aarch64/mod.rs index 6d1d0e26359..df6e712dcf5 100644 --- a/src/vmm/src/arch/aarch64/mod.rs +++ b/src/vmm/src/arch/aarch64/mod.rs @@ -24,11 +24,11 @@ use linux_loader::loader::pe::PE as Loader; use linux_loader::loader::{Cmdline, KernelLoader}; use vm_memory::GuestMemoryError; -use crate::arch::{BootProtocol, EntryPoint}; +use crate::arch::{BootProtocol, EntryPoint, arch_memory_regions_with_gap}; use crate::cpu_config::aarch64::{CpuConfiguration, CpuConfigurationError}; use crate::cpu_config::templates::CustomCpuTemplate; use crate::initrd::InitrdConfig; -use crate::utils::{align_up, usize_to_u64}; +use crate::utils::{align_up, u64_to_usize, usize_to_u64}; use crate::vmm_config::machine_config::MachineConfig; use crate::vstate::memory::{Address, Bytes, GuestAddress, GuestMemory, GuestMemoryMmap}; use crate::vstate::vcpu::KvmVcpuError; @@ -51,42 +51,34 @@ pub enum ConfigurationError {
VcpuConfigure(#[from] KvmVcpuError), } -/// The start of the memory area reserved for MMIO devices. -pub const MMIO_MEM_START: u64 = layout::MAPPED_IO_START; -/// The size of the memory area reserved for MMIO devices. -pub const MMIO_MEM_SIZE: u64 = layout::DRAM_MEM_START - layout::MAPPED_IO_START; //>> 1GB - /// Returns a Vec of the valid memory addresses for aarch64. /// See [`layout`](layout) module for a drawing of the specific memory model for this platform. -/// -/// The `offset` parameter specified the offset from [`layout::DRAM_MEM_START`]. -pub fn arch_memory_regions(offset: usize, size: usize) -> Vec<(GuestAddress, usize)> { +pub fn arch_memory_regions(size: usize) -> Vec<(GuestAddress, usize)> { assert!(size > 0, "Attempt to allocate guest memory of length 0"); - assert!( - offset.checked_add(size).is_some(), - "Attempt to allocate guest memory such that the address space would wrap around" - ); - assert!( - offset < layout::DRAM_MEM_MAX_SIZE, - "offset outside allowed DRAM range" - ); - let dram_size = min(size, layout::DRAM_MEM_MAX_SIZE - offset); + let dram_size = min(size, layout::DRAM_MEM_MAX_SIZE); if dram_size != size { logger::warn!( - "Requested offset/memory size {}/{} exceeds architectural maximum (1022GiB). Size has \ - been truncated to {}", - offset, + "Requested memory size {} exceeds architectural maximum (1022GiB). Size has been \ + truncated to {}", size, dram_size ); } - vec![( - GuestAddress(layout::DRAM_MEM_START + offset as u64), + let mut regions = vec![]; + if let Some((offset, remaining)) = arch_memory_regions_with_gap( + &mut regions, + u64_to_usize(layout::DRAM_MEM_START), dram_size, - )] + u64_to_usize(layout::MMIO64_MEM_START), + u64_to_usize(layout::MMIO64_MEM_SIZE), + ) { + regions.push((GuestAddress(offset as u64), remaining)); + } + + regions } /// Configures the system for booting Linux. 
@@ -211,39 +203,66 @@ pub fn load_kernel( #[cfg(kani)] mod verification { - use vm_memory::GuestAddress; - - use crate::arch::aarch64::layout; + use crate::arch::aarch64::layout::{ + DRAM_MEM_MAX_SIZE, DRAM_MEM_START, FIRST_ADDR_PAST_64BITS_MMIO, MMIO64_MEM_START, + }; use crate::arch::arch_memory_regions; #[kani::proof] #[kani::unwind(3)] fn verify_arch_memory_regions() { - let offset: u64 = kani::any::<u64>(); - let len: u64 = kani::any::<u64>(); - + let len: usize = kani::any::<usize>(); kani::assume(len > 0); - kani::assume(offset.checked_add(len).is_some()); - kani::assume(offset < layout::DRAM_MEM_MAX_SIZE as u64); - let regions = arch_memory_regions(offset as usize, len as usize); + let regions = arch_memory_regions(len); - // No MMIO gap on ARM - assert_eq!(regions.len(), 1); + for region in &regions { + println!( + "region: [{:x}:{:x})", + region.0.0, + region.0.0 + region.1 as u64 + ); + } - let (GuestAddress(start), actual_len) = regions[0]; - let actual_len = actual_len as u64; + // On Arm we have one MMIO gap that might fall within addressable ranges, + // so we can get either 1 or 2 regions. + assert!(regions.len() >= 1); + assert!(regions.len() <= 2); - assert_eq!(start, layout::DRAM_MEM_START + offset); - assert!(actual_len <= layout::DRAM_MEM_MAX_SIZE as u64); + // The total length of all regions cannot exceed DRAM_MEM_MAX_SIZE + let actual_len = regions.iter().map(|&(_, len)| len).sum::<usize>(); + assert!(actual_len <= DRAM_MEM_MAX_SIZE); + // The total length is smaller or equal to the length we asked assert!(actual_len <= len); + // If it's smaller, it's because we asked more than the maximum possible.
+ if (actual_len) < len { + assert!(len > DRAM_MEM_MAX_SIZE); + } - if actual_len < len { - assert_eq!( - start + actual_len, - layout::DRAM_MEM_START + layout::DRAM_MEM_MAX_SIZE as u64 - ); - assert!(offset + len >= layout::DRAM_MEM_MAX_SIZE as u64); + // No region overlaps the 64-bit MMIO gap + assert!( + regions + .iter() + .all(|&(start, len)| start.0 >= FIRST_ADDR_PAST_64BITS_MMIO + || start.0 + len as u64 <= MMIO64_MEM_START) + ); + + // All regions start after our DRAM_MEM_START + assert!(regions.iter().all(|&(start, _)| start.0 >= DRAM_MEM_START)); + + // All regions have non-zero length + assert!(regions.iter().all(|&(_, len)| len > 0)); + + // If there's two regions, they perfectly snuggle up the 64bit MMIO gap + if regions.len() == 2 { + kani::cover!(); + + // The very first address should be DRAM_MEM_START + assert_eq!(regions[0].0.0, DRAM_MEM_START); + // The first region ends at the beginning of the 64 bits gap. + assert_eq!(regions[0].0.0 + regions[0].1 as u64, MMIO64_MEM_START); + // The second region starts exactly after the 64 bits gap. 
+ assert_eq!(regions[1].0.0, FIRST_ADDR_PAST_64BITS_MMIO); } } } @@ -251,33 +270,42 @@ mod verification { #[cfg(test)] mod tests { use super::*; + use crate::arch::aarch64::layout::{ + DRAM_MEM_MAX_SIZE, DRAM_MEM_START, FDT_MAX_SIZE, FIRST_ADDR_PAST_64BITS_MMIO, + MMIO64_MEM_START, + }; use crate::test_utils::arch_mem; #[test] fn test_regions_lt_1024gb() { - let regions = arch_memory_regions(0, 1usize << 29); + let regions = arch_memory_regions(1usize << 29); assert_eq!(1, regions.len()); - assert_eq!(GuestAddress(super::layout::DRAM_MEM_START), regions[0].0); + assert_eq!(GuestAddress(DRAM_MEM_START), regions[0].0); assert_eq!(1usize << 29, regions[0].1); } #[test] fn test_regions_gt_1024gb() { - let regions = arch_memory_regions(0, 1usize << 41); - assert_eq!(1, regions.len()); - assert_eq!(GuestAddress(super::layout::DRAM_MEM_START), regions[0].0); - assert_eq!(super::layout::DRAM_MEM_MAX_SIZE, regions[0].1); + let regions = arch_memory_regions(1usize << 41); + assert_eq!(2, regions.len()); + assert_eq!(GuestAddress(DRAM_MEM_START), regions[0].0); + assert_eq!(MMIO64_MEM_START - DRAM_MEM_START, regions[0].1 as u64); + assert_eq!(GuestAddress(FIRST_ADDR_PAST_64BITS_MMIO), regions[1].0); + assert_eq!( + DRAM_MEM_MAX_SIZE as u64 - MMIO64_MEM_START + DRAM_MEM_START, + regions[1].1 as u64 + ); } #[test] fn test_get_fdt_addr() { - let mem = arch_mem(layout::FDT_MAX_SIZE - 0x1000); - assert_eq!(get_fdt_addr(&mem), layout::DRAM_MEM_START); + let mem = arch_mem(FDT_MAX_SIZE - 0x1000); + assert_eq!(get_fdt_addr(&mem), DRAM_MEM_START); - let mem = arch_mem(layout::FDT_MAX_SIZE); - assert_eq!(get_fdt_addr(&mem), layout::DRAM_MEM_START); + let mem = arch_mem(FDT_MAX_SIZE); + assert_eq!(get_fdt_addr(&mem), DRAM_MEM_START); - let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); - assert_eq!(get_fdt_addr(&mem), 0x1000 + layout::DRAM_MEM_START); + let mem = arch_mem(FDT_MAX_SIZE + 0x1000); + assert_eq!(get_fdt_addr(&mem), 0x1000 + DRAM_MEM_START); } } diff --git 
a/src/vmm/src/arch/mod.rs b/src/vmm/src/arch/mod.rs index 61d65fea1a5..9ed6d1a4190 100644 --- a/src/vmm/src/arch/mod.rs +++ b/src/vmm/src/arch/mod.rs @@ -20,10 +20,13 @@ pub use aarch64::vcpu::*; pub use aarch64::vm::{ArchVm, ArchVmError, VmState}; #[cfg(target_arch = "aarch64")] pub use aarch64::{ - ConfigurationError, MMIO_MEM_SIZE, MMIO_MEM_START, arch_memory_regions, - configure_system_for_boot, get_kernel_start, initrd_load_addr, layout::CMDLINE_MAX_SIZE, - layout::IRQ_BASE, layout::IRQ_MAX, layout::SYSTEM_MEM_SIZE, layout::SYSTEM_MEM_START, - load_kernel, + ConfigurationError, arch_memory_regions, configure_system_for_boot, get_kernel_start, + initrd_load_addr, layout::BOOT_DEVICE_MEM_START, layout::CMDLINE_MAX_SIZE, layout::IRQ_BASE, + layout::IRQ_MAX, layout::MEM_32BIT_DEVICES_SIZE, layout::MEM_32BIT_DEVICES_START, + layout::MEM_64BIT_DEVICES_SIZE, layout::MEM_64BIT_DEVICES_START, layout::MMIO32_MEM_SIZE, + layout::MMIO32_MEM_START, layout::PCI_MMCONFIG_SIZE, layout::PCI_MMCONFIG_START, + layout::PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, layout::RTC_MEM_START, layout::SERIAL_MEM_START, + layout::SYSTEM_MEM_SIZE, layout::SYSTEM_MEM_START, load_kernel, }; /// Module for x86_64 related functionality. 
@@ -39,9 +42,11 @@ pub use x86_64::vm::{ArchVm, ArchVmError, VmState}; #[cfg(target_arch = "x86_64")] pub use crate::arch::x86_64::{ - ConfigurationError, MMIO_MEM_SIZE, MMIO_MEM_START, arch_memory_regions, - configure_system_for_boot, get_kernel_start, initrd_load_addr, layout::APIC_ADDR, - layout::CMDLINE_MAX_SIZE, layout::IOAPIC_ADDR, layout::IRQ_BASE, layout::IRQ_MAX, + ConfigurationError, arch_memory_regions, configure_system_for_boot, get_kernel_start, + initrd_load_addr, layout::APIC_ADDR, layout::BOOT_DEVICE_MEM_START, layout::CMDLINE_MAX_SIZE, + layout::IOAPIC_ADDR, layout::IRQ_BASE, layout::IRQ_MAX, layout::MEM_32BIT_DEVICES_SIZE, + layout::MEM_32BIT_DEVICES_START, layout::MEM_64BIT_DEVICES_SIZE, + layout::MEM_64BIT_DEVICES_START, layout::MMIO32_MEM_SIZE, layout::MMIO32_MEM_START, layout::SYSTEM_MEM_SIZE, layout::SYSTEM_MEM_START, load_kernel, }; @@ -114,3 +119,32 @@ pub struct EntryPoint { /// Specifies which boot protocol to use pub protocol: BootProtocol, } + +/// Adds in [`regions`] the valid memory regions suitable for RAM taking into account a gap in the +/// available address space and returns the remaining region (if any) past this gap +fn arch_memory_regions_with_gap( + regions: &mut Vec<(GuestAddress, usize)>, + region_start: usize, + region_size: usize, + gap_start: usize, + gap_size: usize, +) -> Option<(usize, usize)> { + // 0-sized gaps don't really make sense. We should never receive such a gap. 
+ assert!(gap_size > 0); + + let first_addr_past_gap = gap_start + gap_size; + match (region_start + region_size).checked_sub(gap_start) { + // case0: region fits all before gap + None | Some(0) => { + regions.push((GuestAddress(region_start as u64), region_size)); + None + } + // case1: region starts before the gap and goes past it + Some(remaining) if region_start < gap_start => { + regions.push((GuestAddress(region_start as u64), gap_start - region_start)); + Some((first_addr_past_gap, remaining)) + } + // case2: region starts past the gap + Some(_) => Some((first_addr_past_gap.max(region_start), region_size)), + } +} diff --git a/src/vmm/src/arch/x86_64/layout.rs b/src/vmm/src/arch/x86_64/layout.rs index 18d718a49b8..8ae558e91c3 100644 --- a/src/vmm/src/arch/x86_64/layout.rs +++ b/src/vmm/src/arch/x86_64/layout.rs @@ -7,6 +7,9 @@ //! Magic addresses externally used to lay out x86_64 VMs. +use crate::device_manager::mmio::MMIO_LEN; +use crate::utils::mib_to_bytes; + /// Initial stack for the boot CPU. pub const BOOT_STACK_POINTER: u64 = 0x8ff0; @@ -77,3 +80,45 @@ pub const SYSTEM_MEM_START: u64 = 0x9fc00; /// 257KiB is more than we need, however we reserve this space for potential future use of /// ACPI features (new tables and/or devices). pub const SYSTEM_MEM_SIZE: u64 = RSDP_ADDR - SYSTEM_MEM_START; + +/// First address that cannot be addressed using 32 bit anymore. +pub const FIRST_ADDR_PAST_32BITS: u64 = 1 << 32; + +/// The size of the memory area reserved for MMIO 32-bit accesses. +pub const MMIO32_MEM_SIZE: u64 = mib_to_bytes(1024) as u64; +/// The start of the memory area reserved for MMIO 32-bit accesses. +pub const MMIO32_MEM_START: u64 = FIRST_ADDR_PAST_32BITS - MMIO32_MEM_SIZE; + +// We dedicate the last 256 MiB of the 32-bit MMIO address space PCIe for memory-mapped access to +// configuration. +/// Size of MMIO region for PCIe configuration accesses. 
+pub const PCI_MMCONFIG_SIZE: u64 = 256 << 20; +/// Start of MMIO region for PCIe configuration accesses. +pub const PCI_MMCONFIG_START: u64 = IOAPIC_ADDR as u64 - PCI_MMCONFIG_SIZE; +/// MMIO space per PCIe segment +pub const PCI_MMIO_CONFIG_SIZE_PER_SEGMENT: u64 = 4096 * 256; + +// We reserve 768 MiB for devices at the beginning of the MMIO region. This includes space both for +// pure MMIO and PCIe devices. + +/// Memory region start for boot device. +pub const BOOT_DEVICE_MEM_START: u64 = MMIO32_MEM_START; + +/// Beginning of memory region for device MMIO 32-bit accesses +pub const MEM_32BIT_DEVICES_START: u64 = BOOT_DEVICE_MEM_START + MMIO_LEN; +/// Size of memory region for device MMIO 32-bit accesses +pub const MEM_32BIT_DEVICES_SIZE: u64 = PCI_MMCONFIG_START - MEM_32BIT_DEVICES_START; + +// 64-bits region for MMIO accesses +/// The start of the memory area reserved for MMIO 64-bit accesses. +pub const MMIO64_MEM_START: u64 = 256 << 30; +/// The size of the memory area reserved for MMIO 64-bit accesses. 
+pub const MMIO64_MEM_SIZE: u64 = 256 << 30; + +// At the moment, all of this region goes to devices +/// Beginning of memory region for device MMIO 64-bit accesses +pub const MEM_64BIT_DEVICES_START: u64 = MMIO64_MEM_START; +/// Size of memory region for device MMIO 64-bit accesses +pub const MEM_64BIT_DEVICES_SIZE: u64 = MMIO64_MEM_SIZE; +/// First address past the 64-bit MMIO gap +pub const FIRST_ADDR_PAST_64BITS_MMIO: u64 = MMIO64_MEM_START + MMIO64_MEM_SIZE; diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index c54ec46c987..add5bd52dd7 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -33,7 +33,10 @@ pub mod generated; use std::fs::File; -use layout::CMDLINE_START; +use layout::{ + CMDLINE_START, FIRST_ADDR_PAST_32BITS, FIRST_ADDR_PAST_64BITS_MMIO, MMIO32_MEM_SIZE, + MMIO32_MEM_START, MMIO64_MEM_SIZE, MMIO64_MEM_START, PCI_MMCONFIG_SIZE, PCI_MMCONFIG_START, +}; use linux_loader::configurator::linux::LinuxBootConfigurator; use linux_loader::configurator::pvh::PvhBootConfigurator; use linux_loader::configurator::{BootConfigurator, BootParams}; @@ -47,17 +50,17 @@ use log::debug; use super::EntryPoint; use crate::acpi::create_acpi_tables; -use crate::arch::{BootProtocol, SYSTEM_MEM_SIZE, SYSTEM_MEM_START}; +use crate::arch::{BootProtocol, SYSTEM_MEM_SIZE, SYSTEM_MEM_START, arch_memory_regions_with_gap}; use crate::cpu_config::templates::{CustomCpuTemplate, GuestConfigError}; use crate::cpu_config::x86_64::CpuConfiguration; use crate::initrd::InitrdConfig; -use crate::utils::{align_down, mib_to_bytes, u64_to_usize, usize_to_u64}; +use crate::utils::{align_down, u64_to_usize, usize_to_u64}; use crate::vmm_config::machine_config::MachineConfig; use crate::vstate::memory::{ Address, GuestAddress, GuestMemory, GuestMemoryMmap, GuestMemoryRegion, }; use crate::vstate::vcpu::KvmVcpuConfigureError; -use crate::{Vcpu, VcpuConfig, Vmm}; +use crate::{Vcpu, VcpuConfig, Vmm, logger}; // Value taken from
https://elixir.bootlin.com/linux/v5.10.68/source/arch/x86/include/uapi/asm/e820.h#L31 // Usable normal RAM @@ -96,48 +99,53 @@ pub enum ConfigurationError { Acpi(#[from] crate::acpi::AcpiError), } -/// First address that cannot be addressed using 32 bit anymore. -pub const FIRST_ADDR_PAST_32BITS: u64 = 1 << 32; - -/// Size of MMIO gap at top of 32-bit address space. -pub const MEM_32BIT_GAP_SIZE: u64 = mib_to_bytes(768) as u64; -/// The start of the memory area reserved for MMIO devices. -pub const MMIO_MEM_START: u64 = FIRST_ADDR_PAST_32BITS - MEM_32BIT_GAP_SIZE; -/// The size of the memory area reserved for MMIO devices. -pub const MMIO_MEM_SIZE: u64 = MEM_32BIT_GAP_SIZE; - /// Returns a Vec of the valid memory addresses. /// These should be used to configure the GuestMemoryMmap structure for the platform. -/// For x86_64 all addresses are valid from the start of the kernel except a -/// carve out at the end of 32bit address space. -pub fn arch_memory_regions(offset: usize, size: usize) -> Vec<(GuestAddress, usize)> { +/// For x86_64 all addresses are valid from the start of the kernel except an 1GB +/// carve out at the end of 32bit address space and a second 256GB one at the 256GB limit. +pub fn arch_memory_regions(size: usize) -> Vec<(GuestAddress, usize)> { // If we get here with size == 0 something has seriously gone wrong. Firecracker should never // try to allocate guest memory of size 0 assert!(size > 0, "Attempt to allocate guest memory of length 0"); - assert!( - offset.checked_add(size).is_some(), - "Attempt to allocate guest memory such that the address space would wrap around" + + let dram_size = std::cmp::min( + usize::MAX - u64_to_usize(MMIO32_MEM_SIZE) - u64_to_usize(MMIO64_MEM_SIZE), + size, ); - // It's safe to cast MMIO_MEM_START to usize because it fits in a u32 variable - // (It points to an address in the 32 bit space). 
- match (size + offset).checked_sub(u64_to_usize(MMIO_MEM_START)) { - // case1: guest memory fits before the gap - None | Some(0) => vec![(GuestAddress(offset as u64), size)], - // case2: starts before the gap, but doesn't completely fit - Some(remaining) if (offset as u64) < MMIO_MEM_START => vec![ - ( - GuestAddress(offset as u64), - u64_to_usize(MMIO_MEM_START) - offset, - ), - (GuestAddress(FIRST_ADDR_PAST_32BITS), remaining), - ], - // case3: guest memory start after the gap - Some(_) => vec![( - GuestAddress(FIRST_ADDR_PAST_32BITS.max(offset as u64)), + if dram_size != size { + logger::warn!( + "Requested memory size {} exceeds architectural maximum (1022GiB). Size has been \ + truncated to {}", size, - )], + dram_size + ); } + + let mut regions = vec![]; + + if let Some((start_past_32bit_gap, remaining_past_32bit_gap)) = arch_memory_regions_with_gap( + &mut regions, + 0, + dram_size, + u64_to_usize(MMIO32_MEM_START), + u64_to_usize(MMIO32_MEM_SIZE), + ) { + if let Some((start_past_64bit_gap, remaining_past_64bit_gap)) = arch_memory_regions_with_gap( + &mut regions, + start_past_32bit_gap, + remaining_past_32bit_gap, + u64_to_usize(MMIO64_MEM_START), + u64_to_usize(MMIO64_MEM_SIZE), + ) { + regions.push(( + GuestAddress(start_past_64bit_gap as u64), + remaining_past_64bit_gap, + )); + } + } + + regions } /// Returns the memory address where the kernel could be loaded. @@ -237,7 +245,9 @@ fn configure_pvh( ) -> Result<(), ConfigurationError> { const XEN_HVM_START_MAGIC_VALUE: u32 = 0x336e_c578; let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS); - let end_32bit_gap_start = GuestAddress(MMIO_MEM_START); + let end_32bit_gap_start = GuestAddress(MMIO32_MEM_START); + let first_addr_past_64bits = GuestAddress(FIRST_ADDR_PAST_64BITS_MMIO); + let end_64bit_gap_start = GuestAddress(MMIO64_MEM_START); let himem_start = GuestAddress(layout::HIMEM_START); // Vector to hold modules (currently either empty or holding initrd). 
@@ -269,32 +279,42 @@ fn configure_pvh( type_: E820_RESERVED, ..Default::default() }); + memmap.push(hvm_memmap_table_entry { + addr: PCI_MMCONFIG_START, + size: PCI_MMCONFIG_SIZE, + type_: E820_RESERVED, + ..Default::default() + }); let last_addr = guest_mem.last_addr(); - if last_addr < end_32bit_gap_start { + + if last_addr > first_addr_past_64bits { memmap.push(hvm_memmap_table_entry { - addr: himem_start.raw_value(), - size: last_addr.unchecked_offset_from(himem_start) + 1, + addr: first_addr_past_64bits.raw_value(), + size: last_addr.unchecked_offset_from(first_addr_past_64bits) + 1, type_: MEMMAP_TYPE_RAM, ..Default::default() }); - } else { + } + + if last_addr > first_addr_past_32bits { memmap.push(hvm_memmap_table_entry { - addr: himem_start.raw_value(), - size: end_32bit_gap_start.unchecked_offset_from(himem_start), + addr: first_addr_past_32bits.raw_value(), + size: (end_64bit_gap_start.unchecked_offset_from(first_addr_past_32bits)) + .min(last_addr.unchecked_offset_from(first_addr_past_32bits) + 1), type_: MEMMAP_TYPE_RAM, ..Default::default() }); - - if last_addr > first_addr_past_32bits { - memmap.push(hvm_memmap_table_entry { - addr: first_addr_past_32bits.raw_value(), - size: last_addr.unchecked_offset_from(first_addr_past_32bits) + 1, - type_: MEMMAP_TYPE_RAM, - ..Default::default() - }); - } } + memmap.push(hvm_memmap_table_entry { + addr: himem_start.raw_value(), + size: end_32bit_gap_start + .unchecked_offset_from(himem_start) + .min(last_addr.unchecked_offset_from(himem_start) + 1), + type_: MEMMAP_TYPE_RAM, + ..Default::default() + }); + // Construct the hvm_start_info structure and serialize it into // boot_params. This will be stored at PVH_INFO_START address, and %rbx // will be initialized to contain PVH_INFO_START prior to starting the @@ -340,7 +360,9 @@ fn configure_64bit_boot( const KERNEL_LOADER_OTHER: u8 = 0xff; const KERNEL_MIN_ALIGNMENT_BYTES: u32 = 0x0100_0000; // Must be non-zero. 
let first_addr_past_32bits = GuestAddress(FIRST_ADDR_PAST_32BITS); - let end_32bit_gap_start = GuestAddress(MMIO_MEM_START); + let end_32bit_gap_start = GuestAddress(MMIO32_MEM_START); + let first_addr_past_64bits = GuestAddress(FIRST_ADDR_PAST_64BITS_MMIO); + let end_64bit_gap_start = GuestAddress(MMIO64_MEM_START); let himem_start = GuestAddress(layout::HIMEM_START); @@ -371,39 +393,42 @@ fn configure_64bit_boot( layout::SYSTEM_MEM_SIZE, E820_RESERVED, )?; + add_e820_entry( + &mut params, + PCI_MMCONFIG_START, + PCI_MMCONFIG_SIZE, + E820_RESERVED, + )?; let last_addr = guest_mem.last_addr(); - if last_addr < end_32bit_gap_start { + + if last_addr > first_addr_past_64bits { add_e820_entry( &mut params, - himem_start.raw_value(), - // it's safe to use unchecked_offset_from because - // mem_end > himem_start - last_addr.unchecked_offset_from(himem_start) + 1, + first_addr_past_64bits.raw_value(), + last_addr.unchecked_offset_from(first_addr_past_64bits) + 1, E820_RAM, )?; - } else { + } + + if last_addr > first_addr_past_32bits { add_e820_entry( &mut params, - himem_start.raw_value(), - // it's safe to use unchecked_offset_from because - // end_32bit_gap_start > himem_start - end_32bit_gap_start.unchecked_offset_from(himem_start), + first_addr_past_32bits.raw_value(), + (end_64bit_gap_start.unchecked_offset_from(first_addr_past_32bits)) + .min(last_addr.unchecked_offset_from(first_addr_past_32bits) + 1), E820_RAM, )?; - - if last_addr > first_addr_past_32bits { - add_e820_entry( - &mut params, - first_addr_past_32bits.raw_value(), - // it's safe to use unchecked_offset_from because - // mem_end > first_addr_past_32bits - last_addr.unchecked_offset_from(first_addr_past_32bits) + 1, - E820_RAM, - )?; - } } + add_e820_entry( + &mut params, + himem_start.raw_value(), + (last_addr.unchecked_offset_from(himem_start) + 1) + .min(end_32bit_gap_start.unchecked_offset_from(himem_start)), + E820_RAM, + )?; + LinuxBootConfigurator::write_bootparams( &BootParams::new(&params, 
GuestAddress(layout::ZERO_PAGE_START)), guest_mem, @@ -468,51 +493,69 @@ pub fn load_kernel( #[cfg(kani)] mod verification { - use crate::arch::x86_64::FIRST_ADDR_PAST_32BITS; - use crate::arch::{MMIO_MEM_START, arch_memory_regions}; + + use crate::arch::arch_memory_regions; + use crate::arch::x86_64::layout::{ + FIRST_ADDR_PAST_32BITS, FIRST_ADDR_PAST_64BITS_MMIO, MMIO32_MEM_SIZE, MMIO32_MEM_START, + MMIO64_MEM_SIZE, MMIO64_MEM_START, + }; + use crate::utils::u64_to_usize; #[kani::proof] - #[kani::unwind(3)] + #[kani::unwind(4)] fn verify_arch_memory_regions() { - let offset: u64 = kani::any::(); let len: u64 = kani::any::(); kani::assume(len > 0); - kani::assume(offset.checked_add(len).is_some()); - let regions = arch_memory_regions(offset as usize, len as usize); + let regions = arch_memory_regions(len as usize); - // There's only one MMIO gap, so we can get either 1 or 2 regions - assert!(regions.len() <= 2); + // There are two MMIO gaps, so we can get either 1, 2 or 3 regions + assert!(regions.len() <= 3); assert!(regions.len() >= 1); + // The first address is always 0 + assert_eq!(regions[0].0.0, 0); + // The total length of all regions is what we requested - assert_eq!( - regions.iter().map(|&(_, len)| len).sum::(), - len as usize - ); + let actual_size = regions.iter().map(|&(_, len)| len).sum::(); + assert!(actual_size <= len as usize); + if actual_size < u64_to_usize(len) { + assert_eq!( + actual_size, + usize::MAX - u64_to_usize(MMIO32_MEM_SIZE) - u64_to_usize(MMIO64_MEM_SIZE) + ); + } // No region overlaps the MMIO gap assert!( regions .iter() - .all(|&(start, len)| start.0 >= FIRST_ADDR_PAST_32BITS - || start.0 + len as u64 <= MMIO_MEM_START) + .all(|&(start, len)| (start.0 >= FIRST_ADDR_PAST_32BITS + || start.0 + len as u64 <= MMIO32_MEM_START) + && (start.0 >= FIRST_ADDR_PAST_64BITS_MMIO + || start.0 + len as u64 <= MMIO64_MEM_START)) ); - // All regions start after our specified offset - assert!(regions.iter().all(|&(start, _)| start.0 >= offset as 
u64)); - // All regions have non-zero length assert!(regions.iter().all(|&(_, len)| len > 0)); - // If there's two regions, they perfectly snuggle up to the MMIO gap - if regions.len() == 2 { + // If there's at least two regions, they perfectly snuggle up to one of the two MMIO gaps + if regions.len() >= 2 { kani::cover!(); - assert_eq!(regions[0].0.0 + regions[0].1 as u64, MMIO_MEM_START); + assert_eq!(regions[0].0.0 + regions[0].1 as u64, MMIO32_MEM_START); assert_eq!(regions[1].0.0, FIRST_ADDR_PAST_32BITS); } + + // If there are three regions, the last two perfectly snuggle up to the 64bit + // MMIO gap + if regions.len() == 3 { + kani::cover!(); + + assert_eq!(regions[1].0.0 + regions[1].1 as u64, MMIO64_MEM_START); + assert_eq!(regions[2].0.0, FIRST_ADDR_PAST_64BITS_MMIO); + } } } @@ -523,37 +566,25 @@ mod tests { use super::*; use crate::device_manager::resources::ResourceAllocator; use crate::test_utils::{arch_mem, single_region_mem}; + use crate::utils::mib_to_bytes; #[test] fn regions_lt_4gb() { - let regions = arch_memory_regions(0, 1usize << 29); + let regions = arch_memory_regions(1usize << 29); assert_eq!(1, regions.len()); assert_eq!(GuestAddress(0), regions[0].0); assert_eq!(1usize << 29, regions[0].1); - - let regions = arch_memory_regions(1 << 28, 1 << 29); - assert_eq!(1, regions.len()); - assert_eq!(regions[0], (GuestAddress(1 << 28), 1 << 29)); } #[test] fn regions_gt_4gb() { const MEMORY_SIZE: usize = (1 << 32) + 0x8000; - let regions = arch_memory_regions(0, MEMORY_SIZE); + let regions = arch_memory_regions(MEMORY_SIZE); assert_eq!(2, regions.len()); assert_eq!(GuestAddress(0), regions[0].0); assert_eq!(GuestAddress(1u64 << 32), regions[1].0); - let regions = arch_memory_regions(1 << 31, MEMORY_SIZE); - assert_eq!(2, regions.len()); - assert_eq!( - regions[0], - ( - GuestAddress(1 << 31), - u64_to_usize(MMIO_MEM_START) - (1 << 31) - ) - ); assert_eq!( regions[1], ( diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 
48590201f2d..171eca7f95e 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -1038,8 +1038,8 @@ pub(crate) mod tests { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] assert!(cmdline_contains( &cmdline, - "virtio_mmio.device=4K@0xd0000000:5 virtio_mmio.device=4K@0xd0001000:6 \ - virtio_mmio.device=4K@0xd0002000:7" + "virtio_mmio.device=4K@0xc0001000:5 virtio_mmio.device=4K@0xc0002000:6 \ + virtio_mmio.device=4K@0xc0003000:7" )); } @@ -1137,7 +1137,7 @@ pub(crate) mod tests { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] assert!(cmdline_contains( &cmdline, - "virtio_mmio.device=4K@0xd0000000:5" + "virtio_mmio.device=4K@0xc0001000:5" )); } @@ -1154,7 +1154,7 @@ pub(crate) mod tests { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] assert!(cmdline_contains( &cmdline, - "virtio_mmio.device=4K@0xd0000000:5" + "virtio_mmio.device=4K@0xc0001000:5" )); } @@ -1173,7 +1173,7 @@ pub(crate) mod tests { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] assert!(cmdline_contains( &cmdline, - "virtio_mmio.device=4K@0xd0000000:5" + "virtio_mmio.device=4K@0xc0001000:5" )); } } diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 09c026df610..70449a64ffd 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -21,6 +21,9 @@ use serde::{Deserialize, Serialize}; use vm_allocator::AllocPolicy; use super::resources::ResourceAllocator; +use crate::arch::BOOT_DEVICE_MEM_START; +#[cfg(target_arch = "aarch64")] +use crate::arch::{RTC_MEM_START, SERIAL_MEM_START}; #[cfg(target_arch = "aarch64")] use crate::devices::legacy::{RTCDevice, SerialDevice}; use crate::devices::pseudo::BootTimer; @@ -176,7 +179,7 @@ impl MMIODeviceManager { }; let device_info = MMIODeviceInfo { - addr: resource_allocator.allocate_mmio_memory( + addr: resource_allocator.allocate_32bit_mmio_memory( MMIO_LEN, MMIO_LEN, AllocPolicy::FirstMatch, @@ -293,7 +296,12 @@ impl MMIODeviceManager { let 
device_info = if let Some(device_info) = device_info_opt { device_info } else { - self.allocate_mmio_resources(resource_allocator, 1)? + let gsi = resource_allocator.allocate_gsi(1)?; + MMIODeviceInfo { + addr: SERIAL_MEM_START, + len: MMIO_LEN, + irq: NonZeroU32::new(gsi[0]), + } }; vm.register_irqfd( @@ -348,7 +356,12 @@ impl MMIODeviceManager { let device_info = if let Some(device_info) = device_info_opt { device_info } else { - self.allocate_mmio_resources(resource_allocator, 1)? + let gsi = resource_allocator.allocate_gsi(1)?; + MMIODeviceInfo { + addr: RTC_MEM_START, + len: MMIO_LEN, + irq: NonZeroU32::new(gsi[0]), + } }; let device = MMIODevice { @@ -369,11 +382,15 @@ impl MMIODeviceManager { pub fn register_mmio_boot_timer( &mut self, mmio_bus: &vm_device::Bus, - resource_allocator: &mut ResourceAllocator, boot_timer: Arc>, ) -> Result<(), MmioError> { // Attach a new boot timer device. - let device_info = self.allocate_mmio_resources(resource_allocator, 0)?; + let device_info = MMIODeviceInfo { + addr: BOOT_DEVICE_MEM_START, + len: MMIO_LEN, + irq: None, + }; + let device = MMIODevice { resources: device_info, inner: boot_timer, @@ -698,7 +715,7 @@ pub(crate) mod tests { assert!(device_manager.get_virtio_device(0, "foo").is_none()); let dev = device_manager.get_virtio_device(0, "dummy").unwrap(); - assert_eq!(dev.resources.addr, arch::MMIO_MEM_START); + assert_eq!(dev.resources.addr, arch::MEM_32BIT_DEVICES_START); assert_eq!(dev.resources.len, MMIO_LEN); assert_eq!( dev.resources.irq, @@ -709,7 +726,7 @@ pub(crate) mod tests { .for_each_virtio_device(|virtio_type, device_id, mmio_device| { assert_eq!(*virtio_type, 0); assert_eq!(device_id, "dummy"); - assert_eq!(mmio_device.resources.addr, arch::MMIO_MEM_START); + assert_eq!(mmio_device.resources.addr, arch::MEM_32BIT_DEVICES_START); assert_eq!(mmio_device.resources.len, MMIO_LEN); assert_eq!( mmio_device.resources.irq, diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs 
index 3e3f0f0ffda..f6720233fd1 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -211,11 +211,8 @@ impl DeviceManager { ) -> Result<(), AttachMmioDeviceError> { let boot_timer = Arc::new(Mutex::new(BootTimer::new(request_ts))); - self.mmio_devices.register_mmio_boot_timer( - &self.mmio_bus, - &mut self.resource_allocator, - boot_timer, - )?; + self.mmio_devices + .register_mmio_boot_timer(&self.mmio_bus, boot_timer)?; Ok(()) } diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 99216ec77e7..5d0bd8051e9 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -433,17 +433,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { .event_manager .add_subscriber(serial.clone()); - constructor_args - .resource_allocator - .allocate_mmio_memory( - MMIO_LEN, - MMIO_LEN, - AllocPolicy::ExactMatch(state.device_info.addr), - ) - .map_err(|e| { - DevicePersistError::DeviceManager(super::mmio::MmioError::Allocator(e)) - })?; - dev_manager.register_mmio_serial( vm, constructor_args.mmio_bus, @@ -454,16 +443,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { } if state.type_ == DeviceType::Rtc { let rtc = Arc::new(Mutex::new(RTCDevice::new())); - constructor_args - .resource_allocator - .allocate_mmio_memory( - MMIO_LEN, - MMIO_LEN, - AllocPolicy::ExactMatch(state.device_info.addr), - ) - .map_err(|e| { - DevicePersistError::DeviceManager(super::mmio::MmioError::Allocator(e)) - })?; dev_manager.register_mmio_rtc( constructor_args.mmio_bus, constructor_args.resource_allocator, @@ -507,7 +486,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { constructor_args .resource_allocator - .allocate_mmio_memory( + .allocate_32bit_mmio_memory( MMIO_LEN, MMIO_LEN, AllocPolicy::ExactMatch(device_info.addr), diff --git a/src/vmm/src/device_manager/resources.rs b/src/vmm/src/device_manager/resources.rs index 821148794ec..65087b6fa16 100644 --- 
a/src/vmm/src/device_manager/resources.rs +++ b/src/vmm/src/device_manager/resources.rs @@ -17,8 +17,10 @@ use crate::arch; pub struct ResourceAllocator { // Allocator for device interrupt lines gsi_allocator: IdAllocator, - // Allocator for memory in the MMIO address space - mmio_memory: AddressAllocator, + // Allocator for memory in the 32-bit MMIO address space + mmio32_memory: AddressAllocator, + // Allocator for memory in the 64-bit MMIO address space + mmio64_memory: AddressAllocator, // Memory allocator for system data system_memory: AddressAllocator, } @@ -28,7 +30,14 @@ impl ResourceAllocator { pub fn new() -> Result { Ok(Self { gsi_allocator: IdAllocator::new(arch::IRQ_BASE, arch::IRQ_MAX)?, - mmio_memory: AddressAllocator::new(arch::MMIO_MEM_START, arch::MMIO_MEM_SIZE)?, + mmio32_memory: AddressAllocator::new( + arch::MEM_32BIT_DEVICES_START, + arch::MEM_32BIT_DEVICES_SIZE, + )?, + mmio64_memory: AddressAllocator::new( + arch::MEM_64BIT_DEVICES_START, + arch::MEM_64BIT_DEVICES_SIZE, + )?, system_memory: AddressAllocator::new(arch::SYSTEM_MEM_START, arch::SYSTEM_MEM_SIZE)?, }) } @@ -57,7 +66,7 @@ impl ResourceAllocator { Ok(gsis) } - /// Allocate a memory range in MMIO address space + /// Allocate a memory range in 32-bit MMIO address space /// /// If it succeeds, it returns the first address of the allocated range /// @@ -66,13 +75,37 @@ impl ResourceAllocator { /// * `size` - The size in bytes of the memory to allocate /// * `alignment` - The alignment of the address of the first byte /// * `policy` - A [`vm_allocator::AllocPolicy`] variant for determining the allocation policy - pub fn allocate_mmio_memory( + pub fn allocate_32bit_mmio_memory( &mut self, size: u64, alignment: u64, policy: AllocPolicy, ) -> Result { - Ok(self.mmio_memory.allocate(size, alignment, policy)?.start()) + Ok(self + .mmio32_memory + .allocate(size, alignment, policy)? 
+ .start()) + } + + /// Allocate a memory range in 64-bit MMIO address space + /// + /// If it succeeds, it returns the first address of the allocated range + /// + /// # Arguments + /// + /// * `size` - The size in bytes of the memory to allocate + /// * `alignment` - The alignment of the address of the first byte + /// * `policy` - A [`vm_allocator::AllocPolicy`] variant for determining the allocation policy + pub fn allocate_64bit_mmio_memory( + &mut self, + size: u64, + alignment: u64, + policy: AllocPolicy, + ) -> Result { + Ok(self + .mmio64_memory + .allocate(size, alignment, policy)? + .start()) } /// Allocate a memory range for system data diff --git a/src/vmm/src/devices/virtio/vsock/event_handler.rs b/src/vmm/src/devices/virtio/vsock/event_handler.rs index a54998ba808..b4445e298ae 100755 --- a/src/vmm/src/devices/virtio/vsock/event_handler.rs +++ b/src/vmm/src/devices/virtio/vsock/event_handler.rs @@ -481,8 +481,8 @@ mod tests { #[cfg(target_arch = "x86_64")] #[allow(clippy::cast_possible_truncation)] /* casting of constants we know fit into u32 */ fn test_vsock_bof() { - use crate::arch::MMIO_MEM_START; - use crate::arch::x86_64::{FIRST_ADDR_PAST_32BITS, MEM_32BIT_GAP_SIZE}; + use crate::arch::x86_64::layout::FIRST_ADDR_PAST_32BITS; + use crate::arch::{MMIO32_MEM_SIZE, MMIO32_MEM_START}; use crate::devices::virtio::vsock::packet::VSOCK_PKT_HDR_SIZE; use crate::test_utils::multi_region_mem; use crate::utils::mib_to_bytes; @@ -493,7 +493,7 @@ mod tests { let mut test_ctx = TestContext::new(); test_ctx.mem = multi_region_mem(&[ (GuestAddress(0), 8 * MIB), - (GuestAddress(MMIO_MEM_START - MIB as u64), MIB), + (GuestAddress(MMIO32_MEM_START - MIB as u64), MIB), (GuestAddress(FIRST_ADDR_PAST_32BITS), MIB), ]); @@ -516,15 +516,15 @@ mod tests { } // Let's check what happens when the header descriptor is right before the gap. 
- vsock_bof_helper(&mut test_ctx, 0, MMIO_MEM_START - 1, VSOCK_PKT_HDR_SIZE); + vsock_bof_helper(&mut test_ctx, 0, MMIO32_MEM_START - 1, VSOCK_PKT_HDR_SIZE); // Let's check what happens when the buffer descriptor crosses into the gap, but does // not go past its right edge. vsock_bof_helper( &mut test_ctx, 1, - MMIO_MEM_START - 4, - MEM_32BIT_GAP_SIZE as u32 + 4, + MMIO32_MEM_START - 4, + MMIO32_MEM_SIZE as u32 + 4, ); // Let's modify the buffer descriptor addr and len such that it crosses over the MMIO gap, @@ -532,8 +532,8 @@ mod tests { vsock_bof_helper( &mut test_ctx, 1, - MMIO_MEM_START - 4, - MEM_32BIT_GAP_SIZE as u32 + 100, + MMIO32_MEM_START - 4, + MMIO32_MEM_SIZE as u32 + 100, ); } diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index 70c317bb1e1..a57df4341da 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -472,7 +472,7 @@ impl VmResources { // a single way of backing guest memory for vhost-user and non-vhost-user cases, // that would not be worth the effort. 
let regions = - crate::arch::arch_memory_regions(0, mib_to_bytes(self.machine_config.mem_size_mib)); + crate::arch::arch_memory_regions(mib_to_bytes(self.machine_config.mem_size_mib)); if vhost_user_device_used { memory::memfd_backed( regions.as_ref(), diff --git a/src/vmm/src/test_utils/mod.rs b/src/vmm/src/test_utils/mod.rs index 7cb16a2a213..d3700c98925 100644 --- a/src/vmm/src/test_utils/mod.rs +++ b/src/vmm/src/test_utils/mod.rs @@ -58,11 +58,11 @@ pub fn multi_region_mem_raw(regions: &[(GuestAddress, usize)]) -> Vec GuestMemoryMmap { - multi_region_mem(&crate::arch::arch_memory_regions(0, mem_size_bytes)) + multi_region_mem(&crate::arch::arch_memory_regions(mem_size_bytes)) } pub fn arch_mem_raw(mem_size_bytes: usize) -> Vec { - multi_region_mem_raw(&crate::arch::arch_memory_regions(0, mem_size_bytes)) + multi_region_mem_raw(&crate::arch::arch_memory_regions(mem_size_bytes)) } pub fn create_vmm( From c5fe0d88fffedca2500511bab716fa0a48442f1d Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 6 May 2025 13:00:48 +0200 Subject: [PATCH 16/56] refactor: prepare ResourceAllocator for PCIe devices PCIe devices sometimes need to relocate themselves in memory. To do so, they need to keep an (atomic) reference to a type that implements the `DeviceRelocation` trait. The logic for relocation involves removing the device from the bus it has been registered to, allocating a new address range for it and reinserting it. Instead of creating a new type for it, reuse `ResourceAllocator`. This means that we need to move the buses from the `DeviceManager` inside `ResourceAllocator`. 
Signed-off-by: Babis Chalios --- Cargo.lock | 6 +++ src/vmm/src/acpi/mod.rs | 39 +++++++------- src/vmm/src/arch/aarch64/fdt.rs | 2 +- src/vmm/src/arch/x86_64/mod.rs | 18 +++---- src/vmm/src/arch/x86_64/mptable.rs | 30 +++++------ src/vmm/src/device_manager/mmio.rs | 49 ++++++++--------- src/vmm/src/device_manager/mod.rs | 56 +++++-------------- src/vmm/src/device_manager/persist.rs | 22 ++++---- src/vmm/src/device_manager/resources.rs | 71 ++++++++++++++++++------- src/vmm/src/devices/acpi/vmgenid.rs | 4 +- src/vmm/src/lib.rs | 4 +- 11 files changed, 152 insertions(+), 149 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c74c2191d0e..33c5b3e88a6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -256,6 +256,12 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "cargo_toml" version = "0.22.1" diff --git a/src/vmm/src/acpi/mod.rs b/src/vmm/src/acpi/mod.rs index 542e53409b7..a9b9b2bfb28 100644 --- a/src/vmm/src/acpi/mod.rs +++ b/src/vmm/src/acpi/mod.rs @@ -53,7 +53,7 @@ impl AcpiTableWriter<'_> { /// buffer. It returns the address in which it wrote the table. 
fn write_acpi_table( &mut self, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, table: &mut S, ) -> Result where @@ -94,7 +94,7 @@ impl AcpiTableWriter<'_> { setup_arch_dsdt(&mut dsdt_data)?; let mut dsdt = Dsdt::new(OEM_ID, *b"FCVMDSDT", OEM_REVISION, dsdt_data); - self.write_acpi_table(&mut device_manager.resource_allocator, &mut dsdt) + self.write_acpi_table(&device_manager.resource_allocator, &mut dsdt) } /// Build the FADT table for the guest @@ -102,7 +102,7 @@ impl AcpiTableWriter<'_> { /// This includes a pointer with the location of the DSDT in guest memory fn build_fadt( &mut self, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, dsdt_addr: u64, ) -> Result { let mut fadt = Fadt::new(OEM_ID, *b"FCVMFADT", OEM_REVISION); @@ -120,7 +120,7 @@ impl AcpiTableWriter<'_> { /// This includes information about the interrupt controllers supported in the platform fn build_madt( &mut self, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, nr_vcpus: u8, ) -> Result { let mut madt = Madt::new( @@ -138,7 +138,7 @@ impl AcpiTableWriter<'_> { /// Currently, we pass to the guest just FADT and MADT tables. 
fn build_xsdt( &mut self, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, fadt_addr: u64, madt_addr: u64, ) -> Result { @@ -180,15 +180,14 @@ pub(crate) fn create_acpi_tables( vcpus: &[Vcpu], ) -> Result<(), AcpiError> { let mut writer = AcpiTableWriter { mem }; - let dsdt_addr = writer.build_dsdt(device_manager)?; - let fadt_addr = writer.build_fadt(&mut device_manager.resource_allocator, dsdt_addr)?; + + let fadt_addr = writer.build_fadt(&device_manager.resource_allocator, dsdt_addr)?; let madt_addr = writer.build_madt( - &mut device_manager.resource_allocator, + &device_manager.resource_allocator, vcpus.len().try_into().unwrap(), )?; - let xsdt_addr = - writer.build_xsdt(&mut device_manager.resource_allocator, fadt_addr, madt_addr)?; + let xsdt_addr = writer.build_xsdt(&device_manager.resource_allocator, fadt_addr, madt_addr)?; writer.build_rsdp(xsdt_addr) } @@ -227,7 +226,7 @@ mod tests { #[test] fn test_write_acpi_table_memory_allocation() { // A mocke Vmm object with 128MBs of memory - let mut vmm = default_vmm(); + let vmm = default_vmm(); let mut writer = AcpiTableWriter { mem: vmm.vm.guest_memory(), }; @@ -235,14 +234,14 @@ mod tests { // This should succeed let mut sdt = MockSdt(vec![0; 4096]); let addr = writer - .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START); // Let's try to write two 4K pages plus one byte let mut sdt = MockSdt(vec![0; usize::try_from(SYSTEM_MEM_SIZE + 1).unwrap()]); let err = writer - .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) .unwrap_err(); assert!( matches!( @@ -257,27 +256,27 @@ mod tests { // succeed. 
let mut sdt = MockSdt(vec![0; 5]); let addr = writer - .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4096); let mut sdt = MockSdt(vec![0; 2]); let addr = writer - .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4101); let mut sdt = MockSdt(vec![0; 4]); let addr = writer - .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4103); let mut sdt = MockSdt(vec![0; 8]); let addr = writer - .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4107); let mut sdt = MockSdt(vec![0; 16]); let addr = writer - .write_acpi_table(&mut vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4115); } @@ -294,11 +293,11 @@ mod tests { let mut writer = AcpiTableWriter { mem: vm.guest_memory(), }; - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); let mut sdt = MockSdt(vec![0; usize::try_from(SYSTEM_MEM_SIZE).unwrap()]); let err = writer - .write_acpi_table(&mut resource_allocator, &mut sdt) + .write_acpi_table(&resource_allocator, &mut sdt) .unwrap_err(); assert!( matches!( diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index be53ef6993d..c4f05d33a30 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -477,7 +477,7 @@ mod tests { .register_virtio_test_device( &vm, mem.clone(), - &mut 
device_manager.resource_allocator, + &device_manager.resource_allocator, dummy, &mut cmdline, "dummy", diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index add5bd52dd7..fe1296e5d1c 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -213,7 +213,7 @@ pub fn configure_system_for_boot( // Note that this puts the mptable at the last 1k of Linux's 640k base RAM mptable::setup_mptable( vmm.vm.guest_memory(), - &mut vmm.device_manager.resource_allocator, + &vmm.device_manager.resource_allocator, vcpu_config.vcpu_count, ) .map_err(ConfigurationError::MpTableSetup)?; @@ -598,8 +598,8 @@ mod tests { fn test_system_configuration() { let no_vcpus = 4; let gm = single_region_mem(0x10000); - let mut resource_allocator = ResourceAllocator::new().unwrap(); - let err = mptable::setup_mptable(&gm, &mut resource_allocator, 1); + let resource_allocator = ResourceAllocator::new().unwrap(); + let err = mptable::setup_mptable(&gm, &resource_allocator, 1); assert!(matches!( err.unwrap_err(), mptable::MptableError::NotEnoughMemory @@ -608,24 +608,24 @@ mod tests { // Now assigning some memory that falls before the 32bit memory hole. let mem_size = mib_to_bytes(128); let gm = arch_mem(mem_size); - let mut resource_allocator = ResourceAllocator::new().unwrap(); - mptable::setup_mptable(&gm, &mut resource_allocator, no_vcpus).unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); + mptable::setup_mptable(&gm, &resource_allocator, no_vcpus).unwrap(); configure_64bit_boot(&gm, GuestAddress(0), 0, &None).unwrap(); configure_pvh(&gm, GuestAddress(0), &None).unwrap(); // Now assigning some memory that is equal to the start of the 32bit memory hole. 
let mem_size = mib_to_bytes(3328); let gm = arch_mem(mem_size); - let mut resource_allocator = ResourceAllocator::new().unwrap(); - mptable::setup_mptable(&gm, &mut resource_allocator, no_vcpus).unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); + mptable::setup_mptable(&gm, &resource_allocator, no_vcpus).unwrap(); configure_64bit_boot(&gm, GuestAddress(0), 0, &None).unwrap(); configure_pvh(&gm, GuestAddress(0), &None).unwrap(); // Now assigning some memory that falls after the 32bit memory hole. let mem_size = mib_to_bytes(3330); let gm = arch_mem(mem_size); - let mut resource_allocator = ResourceAllocator::new().unwrap(); - mptable::setup_mptable(&gm, &mut resource_allocator, no_vcpus).unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); + mptable::setup_mptable(&gm, &resource_allocator, no_vcpus).unwrap(); configure_64bit_boot(&gm, GuestAddress(0), 0, &None).unwrap(); configure_pvh(&gm, GuestAddress(0), &None).unwrap(); } diff --git a/src/vmm/src/arch/x86_64/mptable.rs b/src/vmm/src/arch/x86_64/mptable.rs index 6646c17e282..c397290c23e 100644 --- a/src/vmm/src/arch/x86_64/mptable.rs +++ b/src/vmm/src/arch/x86_64/mptable.rs @@ -116,7 +116,7 @@ fn compute_mp_size(num_cpus: u8) -> usize { /// Performs setup of the MP table for the given `num_cpus`. 
pub fn setup_mptable( mem: &GuestMemoryMmap, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, num_cpus: u8, ) -> Result<(), MptableError> { if num_cpus > MAX_SUPPORTED_CPUS { @@ -334,27 +334,27 @@ mod tests { fn bounds_check() { let num_cpus = 4; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); - setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); + setup_mptable(&mem, &resource_allocator, num_cpus).unwrap(); } #[test] fn bounds_check_fails() { let num_cpus = 4; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus) - 1); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); - setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap_err(); + setup_mptable(&mem, &resource_allocator, num_cpus).unwrap_err(); } #[test] fn mpf_intel_checksum() { let num_cpus = 1; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); - setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); + setup_mptable(&mem, &resource_allocator, num_cpus).unwrap(); let mpf_intel: mpspec::mpf_intel = mem.read_obj(GuestAddress(SYSTEM_MEM_START)).unwrap(); @@ -365,9 +365,9 @@ mod tests { fn mpc_table_checksum() { let num_cpus = 4; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); - setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); + setup_mptable(&mem, &resource_allocator, num_cpus).unwrap(); let mpf_intel: mpspec::mpf_intel = 
mem.read_obj(GuestAddress(SYSTEM_MEM_START)).unwrap(); let mpc_offset = GuestAddress(u64::from(mpf_intel.physptr)); @@ -388,9 +388,9 @@ mod tests { fn mpc_entry_count() { let num_cpus = 1; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); - setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); + setup_mptable(&mem, &resource_allocator, num_cpus).unwrap(); let mpf_intel: mpspec::mpf_intel = mem.read_obj(GuestAddress(SYSTEM_MEM_START)).unwrap(); let mpc_offset = GuestAddress(u64::from(mpf_intel.physptr)); @@ -419,8 +419,8 @@ mod tests { let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(MAX_SUPPORTED_CPUS)); for i in 0..MAX_SUPPORTED_CPUS { - let mut resource_allocator = ResourceAllocator::new().unwrap(); - setup_mptable(&mem, &mut resource_allocator, i).unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); + setup_mptable(&mem, &resource_allocator, i).unwrap(); let mpf_intel: mpspec::mpf_intel = mem.read_obj(GuestAddress(SYSTEM_MEM_START)).unwrap(); @@ -450,9 +450,9 @@ mod tests { fn cpu_entry_count_max() { let cpus = MAX_SUPPORTED_CPUS + 1; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(cpus)); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); - let result = setup_mptable(&mem, &mut resource_allocator, cpus).unwrap_err(); + let result = setup_mptable(&mem, &resource_allocator, cpus).unwrap_err(); assert_eq!(result, MptableError::TooManyCpus); } } diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 70449a64ffd..deb07ad9f91 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -169,7 +169,7 @@ impl MMIODeviceManager { /// Allocates resources for a new device to be added. 
fn allocate_mmio_resources( &mut self, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, irq_count: u32, ) -> Result { let irq = match resource_allocator.allocate_gsi(irq_count)?[..] { @@ -253,9 +253,8 @@ impl MMIODeviceManager { pub fn register_mmio_virtio_for_boot( &mut self, vm: &VmFd, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, device_id: String, - mmio_bus: &vm_device::Bus, mmio_device: MmioTransport, _cmdline: &mut kernel_cmdline::Cmdline, ) -> Result<(), MmioError> { @@ -276,7 +275,7 @@ impl MMIODeviceManager { device.resources.irq.unwrap().get(), )?; } - self.register_mmio_virtio(vm, device_id, mmio_bus, device)?; + self.register_mmio_virtio(vm, device_id, &resource_allocator.mmio_bus, device)?; Ok(()) } @@ -286,8 +285,7 @@ impl MMIODeviceManager { pub fn register_mmio_serial( &mut self, vm: &VmFd, - mmio_bus: &vm_device::Bus, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, serial: Arc>, device_info_opt: Option, ) -> Result<(), MmioError> { @@ -315,7 +313,7 @@ impl MMIODeviceManager { inner: serial, }; - mmio_bus.insert( + resource_allocator.mmio_bus.insert( device.inner.clone(), device.resources.addr, device.resources.len, @@ -346,8 +344,7 @@ impl MMIODeviceManager { /// given as parameter, otherwise allocate a new MMIO resources for it. 
pub fn register_mmio_rtc( &mut self, - mmio_bus: &vm_device::Bus, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, rtc: Arc>, device_info_opt: Option, ) -> Result<(), MmioError> { @@ -369,7 +366,7 @@ impl MMIODeviceManager { inner: rtc, }; - mmio_bus.insert( + resource_allocator.mmio_bus.insert( device.inner.clone(), device.resources.addr, device.resources.len, @@ -571,19 +568,17 @@ pub(crate) mod tests { &mut self, vm: &VmFd, guest_mem: GuestMemoryMmap, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, device: Arc>, cmdline: &mut kernel_cmdline::Cmdline, dev_id: &str, ) -> Result { let interrupt = Arc::new(IrqTrigger::new()); - let mmio_bus = vm_device::Bus::new(); let mmio_device = MmioTransport::new(guest_mem, interrupt, device.clone(), false); self.register_mmio_virtio_for_boot( vm, resource_allocator, dev_id.to_string(), - &mmio_bus, mmio_device, cmdline, )?; @@ -693,7 +688,7 @@ pub(crate) mod tests { let mut vm = Vm::new(&kvm).unwrap(); vm.register_memory_regions(guest_mem).unwrap(); let mut device_manager = MMIODeviceManager::new(); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); let dummy = Arc::new(Mutex::new(DummyDevice::new())); @@ -706,7 +701,7 @@ pub(crate) mod tests { .register_virtio_test_device( vm.fd(), vm.guest_memory().clone(), - &mut resource_allocator, + &resource_allocator, dummy, &mut cmdline, "dummy", @@ -747,7 +742,7 @@ pub(crate) mod tests { let mut vm = Vm::new(&kvm).unwrap(); vm.register_memory_regions(guest_mem).unwrap(); let mut device_manager = MMIODeviceManager::new(); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); #[cfg(target_arch = "x86_64")] @@ -760,7 +755,7 @@ pub(crate) 
mod tests { .register_virtio_test_device( vm.fd(), vm.guest_memory().clone(), - &mut resource_allocator, + &resource_allocator, Arc::new(Mutex::new(DummyDevice::new())), &mut cmdline, "dummy1", @@ -774,7 +769,7 @@ pub(crate) mod tests { .register_virtio_test_device( vm.fd(), vm.guest_memory().clone(), - &mut resource_allocator, + &resource_allocator, Arc::new(Mutex::new(DummyDevice::new())), &mut cmdline, "dummy2" @@ -809,7 +804,7 @@ pub(crate) mod tests { vm.setup_irqchip(1).unwrap(); let mut device_manager = MMIODeviceManager::new(); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); let dummy = Arc::new(Mutex::new(DummyDevice::new())); @@ -819,7 +814,7 @@ pub(crate) mod tests { .register_virtio_test_device( vm.fd(), vm.guest_memory().clone(), - &mut resource_allocator, + &resource_allocator, dummy, &mut cmdline, &id, @@ -850,7 +845,7 @@ pub(crate) mod tests { .register_virtio_test_device( vm.fd(), vm.guest_memory().clone(), - &mut resource_allocator, + &resource_allocator, dummy2, &mut cmdline, &id2, @@ -876,10 +871,10 @@ pub(crate) mod tests { #[test] fn test_no_irq_allocation() { let mut device_manager = MMIODeviceManager::new(); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); let device_info = device_manager - .allocate_mmio_resources(&mut resource_allocator, 0) + .allocate_mmio_resources(&resource_allocator, 0) .unwrap(); assert!(device_info.irq.is_none()); } @@ -887,10 +882,10 @@ pub(crate) mod tests { #[test] fn test_irq_allocation() { let mut device_manager = MMIODeviceManager::new(); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); let device_info = device_manager - .allocate_mmio_resources(&mut resource_allocator, 1) + 
.allocate_mmio_resources(&resource_allocator, 1) .unwrap(); assert_eq!(device_info.irq.unwrap().get(), crate::arch::IRQ_BASE); } @@ -898,12 +893,12 @@ pub(crate) mod tests { #[test] fn test_allocation_failure() { let mut device_manager = MMIODeviceManager::new(); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); assert_eq!( format!( "{}", device_manager - .allocate_mmio_resources(&mut resource_allocator, 2) + .allocate_mmio_resources(&resource_allocator, 2) .unwrap_err() ), "Invalid MMIO IRQ configuration.".to_string() diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index f6720233fd1..bcc71236c63 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -96,15 +96,10 @@ pub enum AttachLegacyMmioDeviceError { /// A manager of all peripheral devices of Firecracker pub struct DeviceManager { /// Allocator for system memory and interrupt numbers - pub resource_allocator: ResourceAllocator, - /// MMIO bus - pub mmio_bus: Arc, + pub resource_allocator: Arc, /// MMIO devices pub mmio_devices: MMIODeviceManager, #[cfg(target_arch = "x86_64")] - /// Port IO bus - pub pio_bus: Arc, - #[cfg(target_arch = "x86_64")] /// Legacy devices pub legacy_devices: PortIODeviceManager, /// ACPI devices @@ -145,10 +140,7 @@ impl DeviceManager { vcpu_exit_evt: &EventFd, vmfd: &VmFd, ) -> Result { - let mmio_bus = Arc::new(vm_device::Bus::new()); - - #[cfg(target_arch = "x86_64")] - let pio_bus = Arc::new(vm_device::Bus::new()); + let resource_allocator = Arc::new(ResourceAllocator::new()?); #[cfg(target_arch = "x86_64")] let legacy_devices = { Self::set_stdout_nonblocking(); @@ -163,17 +155,14 @@ impl DeviceManager { // create pio dev manager with legacy devices let mut legacy_devices = PortIODeviceManager::new(serial, i8042)?; - legacy_devices.register_devices(&pio_bus, vmfd)?; + legacy_devices.register_devices(&resource_allocator.pio_bus, 
vmfd)?; legacy_devices }; Ok(DeviceManager { - resource_allocator: ResourceAllocator::new()?, - mmio_bus, + resource_allocator, mmio_devices: MMIODeviceManager::new(), #[cfg(target_arch = "x86_64")] - pio_bus, - #[cfg(target_arch = "x86_64")] legacy_devices, acpi_devices: ACPIDeviceManager::new(), }) @@ -194,9 +183,8 @@ impl DeviceManager { let device = MmioTransport::new(mem.clone(), interrupt, device, is_vhost_user); self.mmio_devices.register_mmio_virtio_for_boot( vmfd, - &mut self.resource_allocator, + &self.resource_allocator, id, - &self.mmio_bus, device, cmdline, )?; @@ -212,7 +200,7 @@ impl DeviceManager { let boot_timer = Arc::new(Mutex::new(BootTimer::new(request_ts))); self.mmio_devices - .register_mmio_boot_timer(&self.mmio_bus, boot_timer)?; + .register_mmio_boot_timer(&self.resource_allocator.mmio_bus, boot_timer)?; Ok(()) } @@ -222,7 +210,7 @@ impl DeviceManager { mem: &GuestMemoryMmap, vmfd: &VmFd, ) -> Result<(), AttachVmgenidError> { - let vmgenid = VmGenId::new(mem, &mut self.resource_allocator)?; + let vmgenid = VmGenId::new(mem, &self.resource_allocator)?; self.acpi_devices.attach_vmgenid(vmgenid, vmfd)?; Ok(()) } @@ -246,23 +234,14 @@ impl DeviceManager { // Make stdout non-blocking. 
Self::set_stdout_nonblocking(); let serial = Self::setup_serial_device(event_manager)?; - self.mmio_devices.register_mmio_serial( - vmfd, - &self.mmio_bus, - &mut self.resource_allocator, - serial, - None, - )?; + self.mmio_devices + .register_mmio_serial(vmfd, &self.resource_allocator, serial, None)?; self.mmio_devices.add_mmio_serial_to_cmdline(cmdline)?; } let rtc = Arc::new(Mutex::new(RTCDevice::new())); - self.mmio_devices.register_mmio_rtc( - &self.mmio_bus, - &mut self.resource_allocator, - rtc, - None, - )?; + self.mmio_devices + .register_mmio_rtc(&self.resource_allocator, rtc, None)?; Ok(()) } } @@ -352,11 +331,10 @@ impl DeviceManager { ) -> Result<(), DevicePersistError> { // Restore MMIO devices let mmio_ctor_args = MMIODevManagerConstructorArgs { - mmio_bus: &self.mmio_bus, mem: restore_args.mem, vm: restore_args.vm, event_manager: restore_args.event_manager, - resource_allocator: &mut self.resource_allocator, + resource_allocator: &self.resource_allocator, vm_resources: restore_args.vm_resources, instance_id: restore_args.instance_id, restored_from_file: restore_args.restored_from_file, @@ -370,7 +348,7 @@ impl DeviceManager { // Restore ACPI devices let acpi_ctor_args = ACPIDeviceManagerConstructorArgs { mem: restore_args.mem, - resource_allocator: &mut self.resource_allocator, + resource_allocator: &self.resource_allocator, vm: restore_args.vm, }; self.acpi_devices = ACPIDeviceManager::restore(acpi_ctor_args, &state.acpi_state)?; @@ -387,12 +365,9 @@ pub(crate) mod tests { use crate::builder::tests::default_vmm; pub(crate) fn default_device_manager() -> DeviceManager { - let mmio_bus = Arc::new(vm_device::Bus::new()); - #[cfg(target_arch = "x86_64")] - let pio_bus = Arc::new(vm_device::Bus::new()); let mmio_devices = MMIODeviceManager::new(); let acpi_devices = ACPIDeviceManager::new(); - let resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); #[cfg(target_arch = "x86_64")] 
let legacy_devices = PortIODeviceManager::new( @@ -407,11 +382,8 @@ pub(crate) mod tests { DeviceManager { resource_allocator, - mmio_bus, mmio_devices, #[cfg(target_arch = "x86_64")] - pio_bus, - #[cfg(target_arch = "x86_64")] legacy_devices, acpi_devices, } diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 5d0bd8051e9..e3c7d2a8475 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -214,11 +214,10 @@ pub enum SharedDeviceType { } pub struct MMIODevManagerConstructorArgs<'a> { - pub mmio_bus: &'a vm_device::Bus, pub mem: &'a GuestMemoryMmap, pub vm: &'a VmFd, pub event_manager: &'a mut EventManager, - pub resource_allocator: &'a mut ResourceAllocator, + pub resource_allocator: &'a ResourceAllocator, pub vm_resources: &'a mut VmResources, pub instance_id: &'a str, pub restored_from_file: bool, @@ -243,7 +242,7 @@ pub struct ACPIDeviceManagerState { pub struct ACPIDeviceManagerConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, - pub resource_allocator: &'a mut ResourceAllocator, + pub resource_allocator: &'a ResourceAllocator, pub vm: &'a VmFd, } @@ -435,7 +434,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { dev_manager.register_mmio_serial( vm, - constructor_args.mmio_bus, constructor_args.resource_allocator, serial, Some(state.device_info), @@ -444,7 +442,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { if state.type_ == DeviceType::Rtc { let rtc = Arc::new(Mutex::new(RTCDevice::new())); dev_manager.register_mmio_rtc( - constructor_args.mmio_bus, constructor_args.resource_allocator, rtc, Some(state.device_info), @@ -532,7 +529,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { &balloon_state.transport_state, interrupt, &balloon_state.device_info, - constructor_args.mmio_bus, + &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -559,7 +556,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { &block_state.transport_state, interrupt, 
&block_state.device_info, - constructor_args.mmio_bus, + &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -608,7 +605,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { &net_state.transport_state, interrupt, &net_state.device_info, - constructor_args.mmio_bus, + &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -640,7 +637,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { &vsock_state.transport_state, interrupt, &vsock_state.device_info, - constructor_args.mmio_bus, + &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -666,7 +663,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { &entropy_state.transport_state, interrupt, &entropy_state.device_info, - constructor_args.mmio_bus, + &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -755,7 +752,7 @@ mod tests { // These need to survive so the restored blocks find them. let _block_files; let mut tmp_sock_file = TempFile::new().unwrap(); - let mut resource_allocator = ResourceAllocator::new().unwrap(); + let resource_allocator = ResourceAllocator::new().unwrap(); tmp_sock_file.remove().unwrap(); // Set up a vmm with one of each device, and get the serialized DeviceStates. 
{ @@ -818,11 +815,10 @@ mod tests { let device_states: DeviceStates = Snapshot::deserialize(&mut buf.as_slice()).unwrap(); let vm_resources = &mut VmResources::default(); let restore_args = MMIODevManagerConstructorArgs { - mmio_bus: &vmm.device_manager.mmio_bus, mem: vmm.vm.guest_memory(), vm: vmm.vm.fd(), event_manager: &mut event_manager, - resource_allocator: &mut resource_allocator, + resource_allocator: &resource_allocator, vm_resources, instance_id: "microvm-id", restored_from_file: true, diff --git a/src/vmm/src/device_manager/resources.rs b/src/vmm/src/device_manager/resources.rs index 65087b6fa16..249d0507ba8 100644 --- a/src/vmm/src/device_manager/resources.rs +++ b/src/vmm/src/device_manager/resources.rs @@ -1,8 +1,12 @@ // Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 +use std::sync::{Arc, Mutex}; + +use pci::DeviceRelocation; pub use vm_allocator::AllocPolicy; use vm_allocator::{AddressAllocator, IdAllocator}; +use vm_device::Bus; use crate::arch; @@ -16,29 +20,40 @@ use crate::arch; #[derive(Debug)] pub struct ResourceAllocator { // Allocator for device interrupt lines - gsi_allocator: IdAllocator, + pub gsi_allocator: Arc>, // Allocator for memory in the 32-bit MMIO address space - mmio32_memory: AddressAllocator, + pub mmio32_memory: Arc>, // Allocator for memory in the 64-bit MMIO address space - mmio64_memory: AddressAllocator, + pub mmio64_memory: Arc>, // Memory allocator for system data - system_memory: AddressAllocator, + pub system_memory: Arc>, + /// MMIO bus + pub mmio_bus: Arc, + #[cfg(target_arch = "x86_64")] + /// Port IO bus + pub pio_bus: Arc, } impl ResourceAllocator { /// Create a new resource allocator for Firecracker devices pub fn new() -> Result { Ok(Self { - gsi_allocator: IdAllocator::new(arch::IRQ_BASE, arch::IRQ_MAX)?, - mmio32_memory: AddressAllocator::new( + gsi_allocator: Arc::new(Mutex::new(IdAllocator::new(arch::IRQ_BASE, arch::IRQ_MAX)?)), + 
mmio32_memory: Arc::new(Mutex::new(AddressAllocator::new( arch::MEM_32BIT_DEVICES_START, arch::MEM_32BIT_DEVICES_SIZE, - )?, - mmio64_memory: AddressAllocator::new( + )?)), + mmio64_memory: Arc::new(Mutex::new(AddressAllocator::new( arch::MEM_64BIT_DEVICES_START, arch::MEM_64BIT_DEVICES_SIZE, - )?, - system_memory: AddressAllocator::new(arch::SYSTEM_MEM_START, arch::SYSTEM_MEM_SIZE)?, + )?)), + system_memory: Arc::new(Mutex::new(AddressAllocator::new( + arch::SYSTEM_MEM_START, + arch::SYSTEM_MEM_SIZE, + )?)), + mmio_bus: Arc::new(Bus::new()), + #[cfg(target_arch = "x86_64")] + pio_bus: Arc::new(Bus::new()), }) } @@ -47,16 +62,17 @@ impl ResourceAllocator { /// # Arguments /// /// * `gsi_count` - The number of GSIs to allocate - pub fn allocate_gsi(&mut self, gsi_count: u32) -> Result, vm_allocator::Error> { + pub fn allocate_gsi(&self, gsi_count: u32) -> Result, vm_allocator::Error> { + let mut gsi_allocator = self.gsi_allocator.lock().expect("Poisoned lock"); let mut gsis = Vec::with_capacity(gsi_count as usize); for _ in 0..gsi_count { - match self.gsi_allocator.allocate_id() { + match gsi_allocator.allocate_id() { Ok(gsi) => gsis.push(gsi), Err(err) => { // It is ok to unwrap here, we just allocated the GSI gsis.into_iter().for_each(|gsi| { - self.gsi_allocator.free_id(gsi).unwrap(); + gsi_allocator.free_id(gsi).unwrap(); }); return Err(err); } @@ -76,13 +92,15 @@ impl ResourceAllocator { /// * `alignment` - The alignment of the address of the first byte /// * `policy` - A [`vm_allocator::AllocPolicy`] variant for determining the allocation policy pub fn allocate_32bit_mmio_memory( - &mut self, + &self, size: u64, alignment: u64, policy: AllocPolicy, ) -> Result { Ok(self .mmio32_memory + .lock() + .expect("Poisoned lock") .allocate(size, alignment, policy)? 
.start()) } @@ -97,13 +115,15 @@ impl ResourceAllocator { /// * `alignment` - The alignment of the address of the first byte /// * `policy` - A [`vm_allocator::AllocPolicy`] variant for determining the allocation policy pub fn allocate_64bit_mmio_memory( - &mut self, + &self, size: u64, alignment: u64, policy: AllocPolicy, ) -> Result { Ok(self .mmio64_memory + .lock() + .expect("Poisoned lock") .allocate(size, alignment, policy)? .start()) } @@ -118,18 +138,33 @@ impl ResourceAllocator { /// * `alignment` - The alignment of the address of the first byte /// * `policy` - A [`vm_allocator::AllocPolicy`] variant for determining the allocation policy pub fn allocate_system_memory( - &mut self, + &self, size: u64, alignment: u64, policy: AllocPolicy, ) -> Result { Ok(self .system_memory + .lock() + .expect("Poisoned lock") .allocate(size, alignment, policy)? .start()) } } +impl DeviceRelocation for ResourceAllocator { + fn move_bar( + &self, + _old_base: u64, + _new_base: u64, + _len: u64, + _pci_dev: &mut dyn pci::PciDevice, + _region_type: pci::PciBarRegionType, + ) -> Result<(), std::io::Error> { + todo!() + } +} + #[cfg(test)] mod tests { use super::ResourceAllocator; @@ -139,7 +174,7 @@ mod tests { #[test] fn test_allocate_gsi() { - let mut allocator = ResourceAllocator::new().unwrap(); + let allocator = ResourceAllocator::new().unwrap(); // asking for 0 IRQs should return us an empty vector assert_eq!(allocator.allocate_gsi(0), Ok(vec![])); // We cannot allocate more GSIs than available @@ -160,7 +195,7 @@ mod tests { // But we should be able to ask for 0 GSIs assert_eq!(allocator.allocate_gsi(0), Ok(vec![])); - let mut allocator = ResourceAllocator::new().unwrap(); + let allocator = ResourceAllocator::new().unwrap(); // We should be able to allocate 1 GSI assert_eq!(allocator.allocate_gsi(1), Ok(vec![arch::IRQ_BASE])); // We can't allocate MAX_IRQS any more diff --git a/src/vmm/src/devices/acpi/vmgenid.rs b/src/vmm/src/devices/acpi/vmgenid.rs index 
31dbf64ec39..df0656bfbcc 100644 --- a/src/vmm/src/devices/acpi/vmgenid.rs +++ b/src/vmm/src/devices/acpi/vmgenid.rs @@ -86,7 +86,7 @@ impl VmGenId { /// Allocate memory and a GSI for sending notifications and build the device pub fn new( mem: &GuestMemoryMmap, - resource_allocator: &mut ResourceAllocator, + resource_allocator: &ResourceAllocator, ) -> Result { let gsi = resource_allocator.allocate_gsi(1)?; // The generation ID needs to live in an 8-byte aligned buffer @@ -133,7 +133,7 @@ pub struct VMGenIDState { #[derive(Debug)] pub struct VMGenIdConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, - pub resource_allocator: &'a mut ResourceAllocator, + pub resource_allocator: &'a ResourceAllocator, } impl<'a> Persist<'a> for VmGenId { diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 30104890e7d..01ef9547d82 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -371,10 +371,10 @@ impl Vmm { self.vcpus_handles.reserve(vcpu_count); for mut vcpu in vcpus.drain(..) { - vcpu.set_mmio_bus(self.device_manager.mmio_bus.clone()); + vcpu.set_mmio_bus(self.device_manager.resource_allocator.mmio_bus.clone()); #[cfg(target_arch = "x86_64")] vcpu.kvm_vcpu - .set_pio_bus(self.device_manager.pio_bus.clone()); + .set_pio_bus(self.device_manager.resource_allocator.pio_bus.clone()); self.vcpus_handles .push(vcpu.start_threaded(vcpu_seccomp_filter.clone(), barrier.clone())?); From 7e60fc30a914973e496001a5b6c8c1c62620d6dd Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 6 May 2025 17:32:40 +0200 Subject: [PATCH 17/56] pci: add support for PCIe segment Add a PCIe segment which includes a single PCIe root port and a bus. At the moment, the PCIe segment is always enabled. A later commit will make it optional and enable it only when a command-line flag is passed to the Firecracker binary.
Signed-off-by: Babis Chalios --- src/firecracker/src/main.rs | 2 +- src/vmm/src/acpi/mod.rs | 4 + src/vmm/src/arch/mod.rs | 4 +- src/vmm/src/builder.rs | 5 + src/vmm/src/device_manager/mod.rs | 14 + src/vmm/src/device_manager/pci_mngr.rs | 45 +++ src/vmm/src/devices/mod.rs | 3 + src/vmm/src/devices/pci/mod.rs | 6 + src/vmm/src/devices/pci/pci_segment.rs | 464 +++++++++++++++++++++++++ 9 files changed, 545 insertions(+), 2 deletions(-) create mode 100644 src/vmm/src/device_manager/pci_mngr.rs create mode 100644 src/vmm/src/devices/pci/mod.rs create mode 100644 src/vmm/src/devices/pci/pci_segment.rs diff --git a/src/firecracker/src/main.rs b/src/firecracker/src/main.rs index 6b01f776729..4d6536d054c 100644 --- a/src/firecracker/src/main.rs +++ b/src/firecracker/src/main.rs @@ -449,7 +449,7 @@ fn main_exec() -> Result<(), MainError> { /// the default the jailer would set). /// /// We do this resizing because the kernel default is 64, with a reallocation happening whenever -/// the tabel fills up. This was happening for some larger microVMs, and reallocating the +/// the table fills up. This was happening for some larger microVMs, and reallocating the /// fdtable while a lot of file descriptors are active (due to being eventfds/timerfds registered /// to epoll) incurs a penalty of 30ms-70ms on the snapshot restore path. 
fn resize_fdtable() -> Result<(), ResizeFdTableError> { diff --git a/src/vmm/src/acpi/mod.rs b/src/vmm/src/acpi/mod.rs index a9b9b2bfb28..65075781188 100644 --- a/src/vmm/src/acpi/mod.rs +++ b/src/vmm/src/acpi/mod.rs @@ -90,6 +90,10 @@ impl AcpiTableWriter<'_> { .acpi_devices .append_aml_bytes(&mut dsdt_data)?; + if let Some(pci_segment) = &device_manager.pci_devices.pci_segment { + pci_segment.append_aml_bytes(&mut dsdt_data)?; + } + // Architecture specific DSDT data setup_arch_dsdt(&mut dsdt_data)?; diff --git a/src/vmm/src/arch/mod.rs b/src/vmm/src/arch/mod.rs index 9ed6d1a4190..3693feed04b 100644 --- a/src/vmm/src/arch/mod.rs +++ b/src/vmm/src/arch/mod.rs @@ -47,7 +47,9 @@ pub use crate::arch::x86_64::{ layout::IOAPIC_ADDR, layout::IRQ_BASE, layout::IRQ_MAX, layout::MEM_32BIT_DEVICES_SIZE, layout::MEM_32BIT_DEVICES_START, layout::MEM_64BIT_DEVICES_SIZE, layout::MEM_64BIT_DEVICES_START, layout::MMIO32_MEM_SIZE, layout::MMIO32_MEM_START, - layout::SYSTEM_MEM_SIZE, layout::SYSTEM_MEM_START, load_kernel, + layout::PCI_MMCONFIG_SIZE, layout::PCI_MMCONFIG_START, + layout::PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, layout::SYSTEM_MEM_SIZE, layout::SYSTEM_MEM_START, + load_kernel, }; /// Types of devices that can get attached to this platform. 
diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 171eca7f95e..57491042968 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -24,6 +24,7 @@ use crate::cpu_config::templates::{ }; #[cfg(target_arch = "aarch64")] use crate::device_manager::AttachLegacyMmioDeviceError; +use crate::device_manager::pci_mngr::PciManagerError; use crate::device_manager::{ AttachMmioDeviceError, AttachVmgenidError, DeviceManager, DevicePersistError, DeviceRestoreArgs, }; @@ -71,6 +72,8 @@ pub enum StartMicrovmError { CreateLegacyDevice(device_manager::legacy::LegacyDeviceError), /// Error creating VMGenID device: {0} CreateVMGenID(VmGenIdError), + /// Error enabling PCIe support: {0} + EnablePciDevices(#[from] PciManagerError), /// Error enabling pvtime on vcpu: {0} #[cfg(target_arch = "aarch64")] EnablePVTime(crate::arch::VcpuArchError), @@ -214,6 +217,8 @@ pub fn build_microvm_for_boot( .map(|vcpu| vcpu.copy_kvm_vcpu_fd(vmm.vm())) .collect::, _>>()?; + vmm.device_manager.enable_pci()?; + // The boot timer device needs to be the first device attached in order // to maintain the same MMIO address referenced in the documentation // and tests. diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index bcc71236c63..5c01a195fc5 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -16,6 +16,7 @@ use legacy::{LegacyDeviceError, PortIODeviceManager}; use linux_loader::loader::Cmdline; use log::error; use mmio::{MMIODeviceManager, MmioError}; +use pci_mngr::{PciDevices, PciManagerError}; use persist::{ACPIDeviceManagerConstructorArgs, MMIODevManagerConstructorArgs}; use resources::ResourceAllocator; use serde::{Deserialize, Serialize}; @@ -43,6 +44,8 @@ pub mod acpi; pub mod legacy; /// Memory Mapped I/O Manager. pub mod mmio; +/// PCIe device manager +pub mod pci_mngr; /// Device managers (de)serialization support. pub mod persist; /// Resource manager for devices. 
@@ -104,6 +107,8 @@ pub struct DeviceManager { pub legacy_devices: PortIODeviceManager, /// ACPI devices pub acpi_devices: ACPIDeviceManager, + /// PCIe devices + pub pci_devices: PciDevices, } impl DeviceManager { @@ -165,6 +170,7 @@ impl DeviceManager { #[cfg(target_arch = "x86_64")] legacy_devices, acpi_devices: ACPIDeviceManager::new(), + pci_devices: PciDevices::new(), }) } @@ -244,6 +250,12 @@ impl DeviceManager { .register_mmio_rtc(&self.resource_allocator, rtc, None)?; Ok(()) } + + /// Enables PCIe support for Firecracker devices + pub fn enable_pci(&mut self) -> Result<(), PciManagerError> { + self.pci_devices + .attach_pci_segment(&self.resource_allocator) + } } #[derive(Debug, Default, Clone, Serialize, Deserialize)] @@ -367,6 +379,7 @@ pub(crate) mod tests { pub(crate) fn default_device_manager() -> DeviceManager { let mmio_devices = MMIODeviceManager::new(); let acpi_devices = ACPIDeviceManager::new(); + let pci_devices = PciDevices::new(); let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); #[cfg(target_arch = "x86_64")] @@ -386,6 +399,7 @@ pub(crate) mod tests { #[cfg(target_arch = "x86_64")] legacy_devices, acpi_devices, + pci_devices, } } diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs new file mode 100644 index 00000000000..c3bf2ada977 --- /dev/null +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -0,0 +1,45 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::sync::Arc; + +use vm_device::BusError; + +use super::resources::ResourceAllocator; +use crate::devices::pci::PciSegment; + +#[derive(Debug, Default)] +pub struct PciDevices { + /// PCIe segment of the VMM, if PCI is enabled. We currently support a single PCIe segment. 
+ pub pci_segment: Option, +} + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum PciManagerError { + /// Resource allocation error: {0} + ResourceAllocation(#[from] vm_allocator::Error), + /// Bus error: {0} + Bus(#[from] BusError), +} + +impl PciDevices { + pub fn new() -> Self { + Default::default() + } + + pub fn attach_pci_segment( + &mut self, + resource_allocator: &Arc, + ) -> Result<(), PciManagerError> { + // We only support a single PCIe segment. Calling this function twice is a Firecracker + // internal error. + assert!(self.pci_segment.is_none()); + + // Currently we don't assign any IRQs to PCI devices. We will be using MSI-X interrupts + // only. + let pci_segment = PciSegment::new(0, resource_allocator, &[0u8; 32])?; + self.pci_segment = Some(pci_segment); + + Ok(()) + } +} diff --git a/src/vmm/src/devices/mod.rs b/src/vmm/src/devices/mod.rs index dd58acc9337..371cc2cfa9e 100644 --- a/src/vmm/src/devices/mod.rs +++ b/src/vmm/src/devices/mod.rs @@ -7,10 +7,13 @@ //! Emulates virtual and hardware devices. +#![allow(unused)] + use std::io; pub mod acpi; pub mod legacy; +pub mod pci; pub mod pseudo; pub mod virtio; diff --git a/src/vmm/src/devices/pci/mod.rs b/src/vmm/src/devices/pci/mod.rs new file mode 100644 index 00000000000..e365b481893 --- /dev/null +++ b/src/vmm/src/devices/pci/mod.rs @@ -0,0 +1,6 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +pub mod pci_segment; + +pub use pci_segment::*; diff --git a/src/vmm/src/devices/pci/pci_segment.rs b/src/vmm/src/devices/pci/pci_segment.rs new file mode 100644 index 00000000000..169ffdcba3b --- /dev/null +++ b/src/vmm/src/devices/pci/pci_segment.rs @@ -0,0 +1,464 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// Copyright © 2019 - 2021 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause +// + +use std::sync::{Arc, Mutex}; + +#[cfg(target_arch = "x86_64")] +use acpi_tables::{Aml, aml}; +use log::info; +#[cfg(target_arch = "x86_64")] +use pci::{PCI_CONFIG_IO_PORT, PCI_CONFIG_IO_PORT_SIZE, PciConfigIo}; +use pci::{PciBdf, PciBus, PciConfigMmio, PciRoot, PciRootError}; +use uuid::Uuid; +use vm_allocator::AddressAllocator; +use vm_device::{BusDeviceSync, BusError}; + +use crate::arch::{PCI_MMCONFIG_START, PCI_MMIO_CONFIG_SIZE_PER_SEGMENT}; +use crate::device_manager::resources::ResourceAllocator; + +pub struct PciSegment { + pub(crate) id: u16, + pub(crate) pci_bus: Arc>, + pub(crate) pci_config_mmio: Arc>, + pub(crate) mmio_config_address: u64, + pub(crate) proximity_domain: u32, + + #[cfg(target_arch = "x86_64")] + pub(crate) pci_config_io: Option>>, + + // Bitmap of PCI devices to hotplug. + pub(crate) pci_devices_up: u32, + // Bitmap of PCI devices to hotunplug. + pub(crate) pci_devices_down: u32, + // List of allocated IRQs for each PCI slot. 
+ pub(crate) pci_irq_slots: [u8; 32], + + // Device memory covered by this segment + pub(crate) start_of_mem32_area: u64, + pub(crate) end_of_mem32_area: u64, + + pub(crate) start_of_mem64_area: u64, + pub(crate) end_of_mem64_area: u64, +} + +impl std::fmt::Debug for PciSegment { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PciSegment") + .field("id", &self.id) + .field("mmio_config_address", &self.mmio_config_address) + .field("proximity_domain", &self.proximity_domain) + .field("pci_devices_up", &self.pci_devices_up) + .field("pci_devices_down", &self.pci_devices_down) + .field("pci_irq_slots", &self.pci_irq_slots) + .field("start_of_mem32_area", &self.start_of_mem32_area) + .field("end_of_mem32_area", &self.end_of_mem32_area) + .field("start_of_mem64_area", &self.start_of_mem64_area) + .field("end_of_mem64_area", &self.end_of_mem64_area) + .finish() + } +} + +impl PciSegment { + fn build( + id: u16, + resource_allocator: &Arc, + pci_irq_slots: &[u8; 32], + ) -> Result { + let pci_root = PciRoot::new(None); + let pci_bus = Arc::new(Mutex::new(PciBus::new( + pci_root, + resource_allocator.clone(), + ))); + + let pci_config_mmio = Arc::new(Mutex::new(PciConfigMmio::new(Arc::clone(&pci_bus)))); + let mmio_config_address = PCI_MMCONFIG_START + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT * id as u64; + + resource_allocator.mmio_bus.insert( + Arc::clone(&pci_config_mmio) as Arc, + mmio_config_address, + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, + )?; + + let mem32_allocator = resource_allocator.mmio32_memory.clone(); + let mem64_allocator = resource_allocator.mmio64_memory.clone(); + + let start_of_mem32_area = mem32_allocator.lock().unwrap().base(); + let end_of_mem32_area = mem32_allocator.lock().unwrap().end(); + + let start_of_mem64_area = mem64_allocator.lock().unwrap().base(); + let end_of_mem64_area = mem64_allocator.lock().unwrap().end(); + + let segment = PciSegment { + id, + pci_bus, + pci_config_mmio, + mmio_config_address, + 
proximity_domain: 0, + pci_devices_up: 0, + pci_devices_down: 0, + #[cfg(target_arch = "x86_64")] + pci_config_io: None, + start_of_mem32_area, + end_of_mem32_area, + start_of_mem64_area, + end_of_mem64_area, + pci_irq_slots: *pci_irq_slots, + }; + + Ok(segment) + } + + #[cfg(target_arch = "x86_64")] + pub(crate) fn new( + id: u16, + resource_allocator: &Arc, + pci_irq_slots: &[u8; 32], + ) -> Result { + let mut segment = Self::build(id, resource_allocator, pci_irq_slots)?; + let pci_config_io = Arc::new(Mutex::new(PciConfigIo::new(Arc::clone(&segment.pci_bus)))); + + resource_allocator.pio_bus.insert( + pci_config_io.clone(), + PCI_CONFIG_IO_PORT, + PCI_CONFIG_IO_PORT_SIZE, + )?; + + segment.pci_config_io = Some(pci_config_io); + + info!( + "pci: adding PCI segment: id={:#x}, PCI MMIO config address: {:#x}, mem32 area: \ + [{:#x}-{:#x}], mem64 area: [{:#x}-{:#x}] IO area: [{PCI_CONFIG_IO_PORT:#x}-{:#x}]", + segment.id, + segment.mmio_config_address, + segment.start_of_mem32_area, + segment.end_of_mem32_area, + segment.start_of_mem64_area, + segment.end_of_mem64_area, + PCI_CONFIG_IO_PORT + PCI_CONFIG_IO_PORT_SIZE - 1 + ); + + Ok(segment) + } + + #[cfg(target_arch = "aarch64")] + pub(crate) fn new( + id: u16, + resource_allocator: &Arc, + pci_irq_slots: &[u8; 32], + ) -> Result { + let segment = Self::build(id, resource_allocator, pci_irq_slots)?; + info!( + "pci: adding PCI segment: id={:#x}, PCI MMIO config address: {:#x}, mem32 area: \ + [{:#x}-{:#x}], mem64 area: [{:#x}-{:#x}]", + segment.id, + segment.mmio_config_address, + segment.start_of_mem32_area, + segment.end_of_mem32_area, + segment.start_of_mem64_area, + segment.end_of_mem64_area, + ); + + Ok(segment) + } + + pub(crate) fn next_device_bdf(&self) -> Result { + Ok(PciBdf::new( + self.id, + 0, + self.pci_bus + .lock() + .unwrap() + .next_device_id()? 
+ .try_into() + .unwrap(), + 0, + )) + } +} + +#[cfg(target_arch = "x86_64")] +struct PciDevSlot { + device_id: u8, +} + +#[cfg(target_arch = "x86_64")] +impl Aml for PciDevSlot { + fn append_aml_bytes(&self, v: &mut Vec) -> Result<(), aml::AmlError> { + let sun = self.device_id; + let adr: u32 = (self.device_id as u32) << 16; + aml::Device::new( + format!("S{:03}", self.device_id).as_str().try_into()?, + vec![ + &aml::Name::new("_SUN".try_into()?, &sun)?, + &aml::Name::new("_ADR".try_into()?, &adr)?, + &aml::Method::new( + "_EJ0".try_into()?, + 1, + true, + vec![&aml::MethodCall::new( + "\\_SB_.PHPR.PCEJ".try_into()?, + vec![&aml::Path::new("_SUN")?, &aml::Path::new("_SEG")?], + )], + ), + ], + ) + .append_aml_bytes(v) + } +} + +#[cfg(target_arch = "x86_64")] +struct PciDevSlotNotify { + device_id: u8, +} + +#[cfg(target_arch = "x86_64")] +impl Aml for PciDevSlotNotify { + fn append_aml_bytes(&self, v: &mut Vec) -> Result<(), aml::AmlError> { + let device_id_mask: u32 = 1 << self.device_id; + let object = aml::Path::new(&format!("S{:03}", self.device_id))?; + aml::And::new(&aml::Local(0), &aml::Arg(0), &device_id_mask).append_aml_bytes(v)?; + aml::If::new( + &aml::Equal::new(&aml::Local(0), &device_id_mask), + vec![&aml::Notify::new(&object, &aml::Arg(1))], + ) + .append_aml_bytes(v) + } +} + +#[cfg(target_arch = "x86_64")] +struct PciDevSlotMethods {} + +#[cfg(target_arch = "x86_64")] +impl Aml for PciDevSlotMethods { + fn append_aml_bytes(&self, v: &mut Vec) -> Result<(), aml::AmlError> { + let mut device_notifies = Vec::new(); + for device_id in 0..32 { + device_notifies.push(PciDevSlotNotify { device_id }); + } + + let mut device_notifies_refs: Vec<&dyn Aml> = Vec::new(); + for device_notify in device_notifies.iter() { + device_notifies_refs.push(device_notify); + } + + aml::Method::new("DVNT".try_into()?, 2, true, device_notifies_refs).append_aml_bytes(v)?; + aml::Method::new( + "PCNT".try_into()?, + 0, + true, + vec![ + 
&aml::Acquire::new("\\_SB_.PHPR.BLCK".try_into()?, 0xffff), + &aml::Store::new( + &aml::Path::new("\\_SB_.PHPR.PSEG")?, + &aml::Path::new("_SEG")?, + ), + &aml::MethodCall::new( + "DVNT".try_into()?, + vec![&aml::Path::new("\\_SB_.PHPR.PCIU")?, &aml::ONE], + ), + &aml::MethodCall::new( + "DVNT".try_into()?, + vec![&aml::Path::new("\\_SB_.PHPR.PCID")?, &3usize], + ), + &aml::Release::new("\\_SB_.PHPR.BLCK".try_into()?), + ], + ) + .append_aml_bytes(v) + } +} + +#[cfg(target_arch = "x86_64")] +struct PciDsmMethod {} + +#[cfg(target_arch = "x86_64")] +impl Aml for PciDsmMethod { + fn append_aml_bytes(&self, v: &mut Vec) -> Result<(), aml::AmlError> { + // Refer to ACPI spec v6.3 Ch 9.1.1 and PCI Firmware spec v3.3 Ch 4.6.1 + // _DSM (Device Specific Method), the following is the implementation in ASL. + + // Method (_DSM, 4, NotSerialized) // _DSM: Device-Specific Method + // { + // If ((Arg0 == ToUUID ("e5c937d0-3553-4d7a-9117-ea4d19c3434d") /* Device Labeling + // Interface */)) { + // If ((Arg2 == Zero)) + // { + // Return (Buffer (One) { 0x21 }) + // } + // If ((Arg2 == 0x05)) + // { + // Return (Zero) + // } + // } + // + // Return (Buffer (One) { 0x00 }) + // } + // + // As per ACPI v6.3 Ch 19.6.142, the UUID is required to be in mixed endian: + // Among the fields of a UUID: + // {d1 (8 digits)} - {d2 (4 digits)} - {d3 (4 digits)} - {d4 (16 digits)} + // d1 ~ d3 need to be little endian, d4 be big endian. + // See https://en.wikipedia.org/wiki/Universally_unique_identifier#Encoding . 
+ let uuid = Uuid::parse_str("E5C937D0-3553-4D7A-9117-EA4D19C3434D").unwrap(); + let (uuid_d1, uuid_d2, uuid_d3, uuid_d4) = uuid.as_fields(); + let mut uuid_buf = vec![]; + uuid_buf.extend(uuid_d1.to_le_bytes()); + uuid_buf.extend(uuid_d2.to_le_bytes()); + uuid_buf.extend(uuid_d3.to_le_bytes()); + uuid_buf.extend(uuid_d4); + aml::Method::new( + "_DSM".try_into()?, + 4, + false, + vec![ + &aml::If::new( + &aml::Equal::new(&aml::Arg(0), &aml::Buffer::new(uuid_buf)), + vec![ + &aml::If::new( + &aml::Equal::new(&aml::Arg(2), &aml::ZERO), + vec![&aml::Return::new(&aml::Buffer::new(vec![0x21]))], + ), + &aml::If::new( + &aml::Equal::new(&aml::Arg(2), &0x05u8), + vec![&aml::Return::new(&aml::ZERO)], + ), + ], + ), + &aml::Return::new(&aml::Buffer::new(vec![0])), + ], + ) + .append_aml_bytes(v) + } +} + +#[cfg(target_arch = "x86_64")] +impl Aml for PciSegment { + fn append_aml_bytes(&self, v: &mut Vec) -> Result<(), aml::AmlError> { + let mut pci_dsdt_inner_data: Vec<&dyn Aml> = Vec::new(); + let hid = aml::Name::new("_HID".try_into()?, &aml::EisaName::new("PNP0A08")?)?; + pci_dsdt_inner_data.push(&hid); + let cid = aml::Name::new("_CID".try_into()?, &aml::EisaName::new("PNP0A03")?)?; + pci_dsdt_inner_data.push(&cid); + let adr = aml::Name::new("_ADR".try_into()?, &aml::ZERO)?; + pci_dsdt_inner_data.push(&adr); + let seg = aml::Name::new("_SEG".try_into()?, &self.id)?; + pci_dsdt_inner_data.push(&seg); + let uid = aml::Name::new("_UID".try_into()?, &aml::ZERO)?; + pci_dsdt_inner_data.push(&uid); + let cca = aml::Name::new("_CCA".try_into()?, &aml::ONE)?; + pci_dsdt_inner_data.push(&cca); + let supp = aml::Name::new("SUPP".try_into()?, &aml::ZERO)?; + pci_dsdt_inner_data.push(&supp); + + let proximity_domain = self.proximity_domain; + let pxm_return = aml::Return::new(&proximity_domain); + let pxm = aml::Method::new("_PXM".try_into()?, 0, false, vec![&pxm_return]); + pci_dsdt_inner_data.push(&pxm); + + let pci_dsm = PciDsmMethod {}; + pci_dsdt_inner_data.push(&pci_dsm); + + 
#[allow(clippy::if_same_then_else)] + let crs = if self.id == 0 { + aml::Name::new( + "_CRS".try_into()?, + &aml::ResourceTemplate::new(vec![ + &aml::AddressSpace::new_bus_number(0x0u16, 0x0u16)?, + &aml::Io::new(0xcf8, 0xcf8, 1, 0x8), + &aml::Memory32Fixed::new( + true, + self.mmio_config_address.try_into().unwrap(), + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT.try_into().unwrap(), + ), + &aml::AddressSpace::new_memory( + aml::AddressSpaceCacheable::NotCacheable, + true, + self.start_of_mem32_area, + self.end_of_mem32_area, + )?, + &aml::AddressSpace::new_memory( + aml::AddressSpaceCacheable::NotCacheable, + true, + self.start_of_mem64_area, + self.end_of_mem64_area, + )?, + &aml::AddressSpace::new_io(0u16, 0x0cf7u16)?, + &aml::AddressSpace::new_io(0x0d00u16, 0xffffu16)?, + ]), + )? + } else { + aml::Name::new( + "_CRS".try_into()?, + &aml::ResourceTemplate::new(vec![ + &aml::AddressSpace::new_bus_number(0x0u16, 0x0u16)?, + &aml::Memory32Fixed::new( + true, + self.mmio_config_address.try_into().unwrap(), + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT.try_into().unwrap(), + ), + &aml::AddressSpace::new_memory( + aml::AddressSpaceCacheable::NotCacheable, + true, + self.start_of_mem32_area, + self.end_of_mem32_area, + )?, + &aml::AddressSpace::new_memory( + aml::AddressSpaceCacheable::NotCacheable, + true, + self.start_of_mem64_area, + self.end_of_mem64_area, + )?, + ]), + )? + }; + pci_dsdt_inner_data.push(&crs); + + let mut pci_devices = Vec::new(); + for device_id in 0..32 { + let pci_device = PciDevSlot { device_id }; + pci_devices.push(pci_device); + } + for pci_device in pci_devices.iter() { + pci_dsdt_inner_data.push(pci_device); + } + + let pci_device_methods = PciDevSlotMethods {}; + pci_dsdt_inner_data.push(&pci_device_methods); + + // Build PCI routing table, listing IRQs assigned to PCI devices. 
+ let prt_package_list: Vec<(u32, u32)> = self + .pci_irq_slots + .iter() + .enumerate() + .map(|(i, irq)| { + ( + ((((u32::try_from(i).unwrap()) & 0x1fu32) << 16) | 0xffffu32), + *irq as u32, + ) + }) + .collect(); + let prt_package_list: Vec = prt_package_list + .iter() + .map(|(bdf, irq)| aml::Package::new(vec![bdf, &0u8, &0u8, irq])) + .collect(); + let prt_package_list: Vec<&dyn Aml> = prt_package_list + .iter() + .map(|item| item as &dyn Aml) + .collect(); + let prt = aml::Name::new("_PRT".try_into()?, &aml::Package::new(prt_package_list))?; + pci_dsdt_inner_data.push(&prt); + + aml::Device::new( + format!("_SB_.PC{:02X}", self.id).as_str().try_into()?, + pci_dsdt_inner_data, + ) + .append_aml_bytes(v) + } +} From a95316f2d904e25ffbad6081bfc0e5e6d7e8b0ed Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 7 May 2025 16:40:54 +0200 Subject: [PATCH 18/56] pci: add support for ACPI MCFG table So that we can declare which memory region is used by PCIe devices for MMCONFIG. Signed-off-by: Babis Chalios --- src/acpi-tables/src/lib.rs | 6 ++- src/acpi-tables/src/mcfg.rs | 77 +++++++++++++++++++++++++++++++++++++ src/vmm/src/acpi/mod.rs | 27 +++++++++++-- 3 files changed, 105 insertions(+), 5 deletions(-) create mode 100644 src/acpi-tables/src/mcfg.rs diff --git a/src/acpi-tables/src/lib.rs b/src/acpi-tables/src/lib.rs index 321328047ed..d3b7df0791e 100644 --- a/src/acpi-tables/src/lib.rs +++ b/src/acpi-tables/src/lib.rs @@ -10,6 +10,7 @@ pub mod aml; pub mod dsdt; pub mod fadt; pub mod madt; +pub mod mcfg; pub mod rsdp; pub mod xsdt; @@ -17,6 +18,7 @@ pub use aml::Aml; pub use dsdt::Dsdt; pub use fadt::Fadt; pub use madt::Madt; +pub use mcfg::Mcfg; pub use rsdp::Rsdp; pub use xsdt::Xsdt; use zerocopy::little_endian::{U32, U64}; @@ -89,7 +91,7 @@ pub struct SdtHeader { pub oem_table_id: [u8; 8], pub oem_revision: U32, pub creator_id: [u8; 4], - pub creator_revison: U32, + pub creator_revision: U32, } impl SdtHeader { @@ -110,7 +112,7 @@ impl SdtHeader { 
oem_table_id, oem_revision: U32::new(oem_revision), creator_id: FC_ACPI_CREATOR_ID, - creator_revison: U32::new(FC_ACPI_CREATOR_REVISION), + creator_revision: U32::new(FC_ACPI_CREATOR_REVISION), } } } diff --git a/src/acpi-tables/src/mcfg.rs b/src/acpi-tables/src/mcfg.rs new file mode 100644 index 00000000000..a5dd8b9d227 --- /dev/null +++ b/src/acpi-tables/src/mcfg.rs @@ -0,0 +1,77 @@ +// Copyright © 2019 Intel Corporation +// Copyright © 2023 Rivos, Inc. +// Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +use std::mem::size_of; + +use vm_memory::{Bytes, GuestAddress, GuestMemory}; +use zerocopy::{Immutable, IntoBytes}; + +use crate::{Result, Sdt, SdtHeader, checksum}; + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Default, Debug, IntoBytes, Clone, Copy, Immutable)] +struct PciRangeEntry { + pub base_address: u64, + pub segment: u16, + pub start: u8, + pub end: u8, + _reserved: u32, +} + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Clone, Copy, Debug, Default, IntoBytes, Immutable)] +pub struct Mcfg { + header: SdtHeader, + _reserved: u64, + pci_range_entry: PciRangeEntry, +} + +impl Mcfg { + pub fn new( + oem_id: [u8; 6], + oem_table_id: [u8; 8], + oem_revision: u32, + pci_mmio_config_addr: u64, + ) -> Self { + let header = SdtHeader::new( + *b"MCFG", + size_of::().try_into().unwrap(), + 1, + oem_id, + oem_table_id, + oem_revision, + ); + + let mut mcfg = Mcfg { + header, + pci_range_entry: PciRangeEntry { + base_address: pci_mmio_config_addr, + segment: 0, + start: 0, + end: 0, + ..Default::default() + }, + ..Default::default() + }; + + mcfg.header.checksum = checksum(&[mcfg.as_bytes()]); + + mcfg + } +} + +impl Sdt for Mcfg { + fn len(&self) -> usize { + self.as_bytes().len() + } + + fn write_to_guest(&mut self, mem: &M, address: GuestAddress) -> Result<()> { + mem.write_slice(self.as_bytes(), address)?; + Ok(()) + } +} diff --git a/src/vmm/src/acpi/mod.rs 
b/src/vmm/src/acpi/mod.rs index 65075781188..a3e471aed9e 100644 --- a/src/vmm/src/acpi/mod.rs +++ b/src/vmm/src/acpi/mod.rs @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use acpi_tables::fadt::{FADT_F_HW_REDUCED_ACPI, FADT_F_PWR_BUTTON, FADT_F_SLP_BUTTON}; -use acpi_tables::{Aml, Dsdt, Fadt, Madt, Rsdp, Sdt, Xsdt, aml}; +use acpi_tables::{Aml, Dsdt, Fadt, Madt, Mcfg, Rsdp, Sdt, Xsdt, aml}; use log::{debug, error}; use vm_allocator::AllocPolicy; @@ -10,6 +10,7 @@ use crate::Vcpu; use crate::acpi::x86_64::{ apic_addr, rsdp_addr, setup_arch_dsdt, setup_arch_fadt, setup_interrupt_controllers, }; +use crate::arch::x86_64::layout; use crate::device_manager::DeviceManager; use crate::device_manager::resources::ResourceAllocator; use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; @@ -145,16 +146,27 @@ impl AcpiTableWriter<'_> { resource_allocator: &ResourceAllocator, fadt_addr: u64, madt_addr: u64, + mcfg_addr: u64, ) -> Result { let mut xsdt = Xsdt::new( OEM_ID, *b"FCMVXSDT", OEM_REVISION, - vec![fadt_addr, madt_addr], + vec![fadt_addr, madt_addr, mcfg_addr], ); self.write_acpi_table(resource_allocator, &mut xsdt) } + /// Build the MCFG table for the guest. + fn build_mcfg( + &mut self, + resource_allocator: &ResourceAllocator, + pci_mmio_config_addr: u64, + ) -> Result { + let mut mcfg = Mcfg::new(OEM_ID, *b"FCMVMCFG", OEM_REVISION, pci_mmio_config_addr); + self.write_acpi_table(resource_allocator, &mut mcfg) + } + /// Build the RSDP pointer for the guest. 
/// /// This will build the RSDP pointer which points to the XSDT table and write it in guest @@ -191,7 +203,16 @@ pub(crate) fn create_acpi_tables( &device_manager.resource_allocator, vcpus.len().try_into().unwrap(), )?; - let xsdt_addr = writer.build_xsdt(&device_manager.resource_allocator, fadt_addr, madt_addr)?; + let mcfg_addr = writer.build_mcfg( + &device_manager.resource_allocator, + layout::PCI_MMCONFIG_START, + )?; + let xsdt_addr = writer.build_xsdt( + &device_manager.resource_allocator, + fadt_addr, + madt_addr, + mcfg_addr, + )?; writer.build_rsdp(xsdt_addr) } From 19e07888179de2cee656745b4b683934101e4e6e Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 14 May 2025 17:35:49 +0200 Subject: [PATCH 19/56] pci: define PCI segment in FDT Write the PCI root bridge in FDT when PCI is enabled. Signed-off-by: Babis Chalios --- src/vmm/src/arch/aarch64/fdt.rs | 63 +++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index c4f05d33a30..8e67a50bd64 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -13,8 +13,13 @@ use vm_memory::GuestMemoryError; use super::cache_info::{CacheEntry, read_cache_config}; use super::gic::GICDevice; +use crate::arch::{ + MEM_32BIT_DEVICES_SIZE, MEM_32BIT_DEVICES_START, MEM_64BIT_DEVICES_SIZE, + MEM_64BIT_DEVICES_START, PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, +}; use crate::device_manager::DeviceManager; use crate::device_manager::mmio::MMIODeviceInfo; +use crate::device_manager::pci_mngr::PciDevices; use crate::devices::acpi::vmgenid::{VMGENID_MEM_SIZE, VmGenId}; use crate::initrd::InitrdConfig; use crate::vstate::memory::{Address, GuestMemory, GuestMemoryMmap}; @@ -90,6 +95,7 @@ pub fn create_fdt( create_psci_node(&mut fdt_writer)?; create_devices_node(&mut fdt_writer, device_manager)?; create_vmgenid_node(&mut fdt_writer, &device_manager.acpi_devices.vmgenid)?; + create_pci_nodes(&mut fdt_writer, 
&device_manager.pci_devices)?; // End Header node. fdt_writer.end_node(root)?; @@ -431,6 +437,63 @@ fn create_devices_node( Ok(()) } +fn create_pci_nodes(fdt: &mut FdtWriter, pci_devices: &PciDevices) -> Result<(), FdtError> { + if pci_devices.pci_segment.is_none() { + return Ok(()); + } + + // Fine to unwrap here, we just checked it's not `None`. + let segment = pci_devices.pci_segment.as_ref().unwrap(); + + let pci_node_name = format!("pci@{:x}", segment.mmio_config_address); + // Each range here is a triple of `(PCI address, CPU address, PCI size)`. + // + // More info about the format can be found here: + // https://elinux.org/Device_Tree_Usage#PCI_Address_Translation + let ranges = [ + // 32bit addresses + 0x200_0000u32, + (MEM_32BIT_DEVICES_START >> 32) as u32, // PCI address + (MEM_32BIT_DEVICES_START & 0xffff_ffff) as u32, + (MEM_32BIT_DEVICES_START >> 32) as u32, // CPU address + (MEM_32BIT_DEVICES_START & 0xffff_ffff) as u32, + (MEM_32BIT_DEVICES_SIZE >> 32) as u32, // Range size + (MEM_32BIT_DEVICES_SIZE & 0xffff_ffff) as u32, + // 64bit addresses + 0x300_0000u32, + // PCI address + (MEM_64BIT_DEVICES_START >> 32) as u32, // PCI address + (MEM_64BIT_DEVICES_START & 0xffff_ffff) as u32, + // CPU address + (MEM_64BIT_DEVICES_START >> 32) as u32, // CPU address + (MEM_64BIT_DEVICES_START & 0xffff_ffff) as u32, + // Range size + (MEM_64BIT_DEVICES_SIZE >> 32) as u32, // Range size + ((MEM_64BIT_DEVICES_SIZE & 0xffff_ffff) >> 32) as u32, + ]; + let pci_node = fdt.begin_node(&pci_node_name)?; + + fdt.property_string("compatible", "pci-host-ecam-generic")?; + fdt.property_string("device_type", "pci")?; + fdt.property_array_u32("ranges", &ranges)?; + fdt.property_array_u32("bus-range", &[0, 0])?; + fdt.property_u32("linux,pci-domain", segment.id.into())?; + fdt.property_u32("#address-cells", 3)?; + fdt.property_u32("#size-cells", 2)?; + fdt.property_array_u64( + "reg", + &[ + segment.mmio_config_address, + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, + ], + )?; + 
fdt.property_u32("#interrupt-cells", 1)?; + fdt.property_null("interrupt-map")?; + fdt.property_null("interrupt-map-mask")?; + fdt.property_null("dma-coherent")?; + Ok(fdt.end_node(pci_node)?) +} + #[cfg(test)] mod tests { use std::ffi::CString; From 7a47c5b6144d919d757bc7cdb6b204945385e4b6 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 13 May 2025 10:42:04 +0200 Subject: [PATCH 20/56] pci: make PCIe support optional Add a command line argument to enable PCIe support. By default, PCIe is disabled. The reason for making PCIe off by default is that users need to explicitly enable PCI support in their kernels. Requiring users to explicitly enable it, does not break existing deployments, i.e. users can upgrade Firecracker within their existing environments without breaking any deployment. Signed-off-by: Babis Chalios --- src/firecracker/src/api_server_adapter.rs | 3 +++ src/firecracker/src/main.rs | 13 +++++++++++++ src/vmm/src/builder.rs | 6 +++++- src/vmm/src/resources.rs | 3 +++ src/vmm/src/rpc_interface.rs | 16 +++++++--------- src/vmm/src/vmm_config/boot_source.rs | 3 +-- 6 files changed, 32 insertions(+), 12 deletions(-) diff --git a/src/firecracker/src/api_server_adapter.rs b/src/firecracker/src/api_server_adapter.rs index 173ef298265..f597a5f7db9 100644 --- a/src/firecracker/src/api_server_adapter.rs +++ b/src/firecracker/src/api_server_adapter.rs @@ -143,6 +143,7 @@ pub(crate) fn run_with_api( instance_info: InstanceInfo, process_time_reporter: ProcessTimeReporter, boot_timer_enabled: bool, + pci_enabled: bool, api_payload_limit: usize, mmds_size_limit: usize, metadata_json: Option<&str>, @@ -212,6 +213,7 @@ pub(crate) fn run_with_api( json, instance_info, boot_timer_enabled, + pci_enabled, mmds_size_limit, metadata_json, ) @@ -224,6 +226,7 @@ pub(crate) fn run_with_api( &to_api, &api_event_fd, boot_timer_enabled, + pci_enabled, mmds_size_limit, metadata_json, ) diff --git a/src/firecracker/src/main.rs b/src/firecracker/src/main.rs index 
4d6536d054c..3e6ad35d6a9 100644 --- a/src/firecracker/src/main.rs +++ b/src/firecracker/src/main.rs @@ -260,6 +260,11 @@ fn main_exec() -> Result<(), MainError> { Argument::new("mmds-size-limit") .takes_value(true) .help("Mmds data store limit, in bytes."), + ) + .arg( + Argument::new("enable-pci") + .takes_value(false) + .help("Enables PCIe support."), ); arg_parser.parse_from_cmdline()?; @@ -369,6 +374,7 @@ fn main_exec() -> Result<(), MainError> { .map(|x| x.expect("Unable to open or read from the mmds content file")); let boot_timer_enabled = arguments.flag_present("boot-timer"); + let pci_enabled = arguments.flag_present("enable-pci"); let api_enabled = !arguments.flag_present("no-api"); let api_payload_limit = arg_parser .arguments() @@ -422,6 +428,7 @@ fn main_exec() -> Result<(), MainError> { instance_info, process_time_reporter, boot_timer_enabled, + pci_enabled, api_payload_limit, mmds_size_limit, metadata_json.as_deref(), @@ -437,6 +444,7 @@ fn main_exec() -> Result<(), MainError> { vmm_config_json, instance_info, boot_timer_enabled, + pci_enabled, mmds_size_limit, metadata_json.as_deref(), ) @@ -554,12 +562,14 @@ pub enum BuildFromJsonError { } // Configure and start a microVM as described by the command-line JSON. 
+#[allow(clippy::too_many_arguments)] fn build_microvm_from_json( seccomp_filters: &BpfThreadMap, event_manager: &mut EventManager, config_json: String, instance_info: InstanceInfo, boot_timer_enabled: bool, + pci_enabled: bool, mmds_size_limit: usize, metadata_json: Option<&str>, ) -> Result<(VmResources, Arc>), BuildFromJsonError> { @@ -567,6 +577,7 @@ fn build_microvm_from_json( VmResources::from_json(&config_json, &instance_info, mmds_size_limit, metadata_json) .map_err(BuildFromJsonError::ParseFromJson)?; vm_resources.boot_timer = boot_timer_enabled; + vm_resources.pci_enabled = pci_enabled; let vmm = vmm::builder::build_and_boot_microvm( &instance_info, &vm_resources, @@ -593,6 +604,7 @@ fn run_without_api( config_json: Option, instance_info: InstanceInfo, bool_timer_enabled: bool, + pci_enabled: bool, mmds_size_limit: usize, metadata_json: Option<&str>, ) -> Result<(), RunWithoutApiError> { @@ -610,6 +622,7 @@ fn run_without_api( config_json.unwrap(), instance_info, bool_timer_enabled, + pci_enabled, mmds_size_limit, metadata_json, ) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 57491042968..2c037fc529f 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -217,7 +217,11 @@ pub fn build_microvm_for_boot( .map(|vcpu| vcpu.copy_kvm_vcpu_fd(vmm.vm())) .collect::, _>>()?; - vmm.device_manager.enable_pci()?; + if vm_resources.pci_enabled { + vmm.device_manager.enable_pci()?; + } else { + boot_cmdline.insert("pci", "off")?; + } // The boot timer device needs to be the first device attached in order // to maintain the same MMIO address referenced in the documentation diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index a57df4341da..365355dfc2d 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -114,6 +114,8 @@ pub struct VmResources { pub mmds_size_limit: usize, /// Whether or not to load boot timer device. pub boot_timer: bool, + /// Whether or not to use PCIe transport for VirtIO devices. 
+ pub pci_enabled: bool, } impl VmResources { @@ -613,6 +615,7 @@ mod tests { boot_timer: false, mmds_size_limit: HTTP_MAX_PAYLOAD_SIZE, entropy: Default::default(), + pci_enabled: false, } } diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index d868c022dd2..e79468ffb91 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -327,18 +327,16 @@ impl<'a> PrebootApiController<'a> { to_api: &std::sync::mpsc::Sender, api_event_fd: &vmm_sys_util::eventfd::EventFd, boot_timer_enabled: bool, + pci_enabled: bool, mmds_size_limit: usize, metadata_json: Option<&str>, ) -> Result<(VmResources, Arc>), BuildMicrovmFromRequestsError> { - let mut vm_resources = VmResources::default(); - // Silence false clippy warning. Clippy suggests using - // VmResources { boot_timer: boot_timer_enabled, ..Default::default() }; but this will - // generate build errors because VmResources contains private fields. - #[allow(clippy::field_reassign_with_default)] - { - vm_resources.mmds_size_limit = mmds_size_limit; - vm_resources.boot_timer = boot_timer_enabled; - } + let mut vm_resources = VmResources { + boot_timer: boot_timer_enabled, + mmds_size_limit, + pci_enabled, + ..Default::default() + }; // Init the data store from file, if present. if let Some(data) = metadata_json { diff --git a/src/vmm/src/vmm_config/boot_source.rs b/src/vmm/src/vmm_config/boot_source.rs index 37ba08be449..297f8abff04 100644 --- a/src/vmm/src/vmm_config/boot_source.rs +++ b/src/vmm/src/vmm_config/boot_source.rs @@ -9,14 +9,13 @@ use serde::{Deserialize, Serialize}; /// Default guest kernel command line: /// - `reboot=k` shut down the guest on reboot, instead of well... 
rebooting; /// - `panic=1` on panic, reboot after 1 second; -/// - `pci=off` do not scan for PCI devices (save boot time); /// - `nomodule` disable loadable kernel module support; /// - `8250.nr_uarts=0` disable 8250 serial interface; /// - `i8042.noaux` do not probe the i8042 controller for an attached mouse (save boot time); /// - `i8042.nomux` do not probe i8042 for a multiplexing controller (save boot time); /// - `i8042.dumbkbd` do not attempt to control kbd state via the i8042 (save boot time). pub const DEFAULT_KERNEL_CMDLINE: &str = - "reboot=k panic=1 pci=off nomodule 8250.nr_uarts=0 i8042.noaux i8042.nomux i8042.dumbkbd"; + "reboot=k panic=1 nomodule 8250.nr_uarts=0 i8042.noaux i8042.nomux i8042.dumbkbd"; /// Strongly typed data structure used to configure the boot source of the /// microvm. From 0d3975c603d5190b55a9771288d184deadf0e832 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 13 May 2025 12:37:56 +0200 Subject: [PATCH 21/56] pci: add support for snapshotting PCI devices At the moment, the logic just restores the device manager and adds the PCIe root complex if PCI is enabled. 
Signed-off-by: Babis Chalios --- src/vmm/src/device_manager/mod.rs | 9 +++++++++ src/vmm/src/device_manager/pci_mngr.rs | 24 ++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 5c01a195fc5..2922060bb13 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -265,6 +265,8 @@ pub struct DevicesState { pub mmio_state: persist::DeviceStates, /// ACPI devices state pub acpi_state: persist::ACPIDeviceManagerState, + /// PCI devices state + pub pci_state: pci_mngr::PciDevicesState, } #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -273,6 +275,8 @@ pub enum DevicePersistError { MmioRestore(#[from] persist::DevicePersistError), /// Error restoring ACPI devices: {0} AcpiRestore(#[from] persist::ACPIDeviceManagerRestoreError), + /// Error restoring PCI devices: {0} + PciRestore(#[from] PciManagerError), /// Error notifying VMGenID device: {0} VmGenidUpdate(#[from] std::io::Error), /// Error resetting serial console: {0} @@ -295,6 +299,7 @@ impl DeviceManager { DevicesState { mmio_state: self.mmio_devices.save(), acpi_state: self.acpi_devices.save(), + pci_state: self.pci_devices.save(), } } @@ -366,6 +371,10 @@ impl DeviceManager { self.acpi_devices = ACPIDeviceManager::restore(acpi_ctor_args, &state.acpi_state)?; self.acpi_devices.notify_vmgenid()?; + // Restore PCI devices + self.pci_devices + .restore(&state.pci_state, &self.resource_allocator)?; + Ok(()) } } diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index c3bf2ada977..e9ada60cc1f 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -3,6 +3,7 @@ use std::sync::Arc; +use serde::{Deserialize, Serialize}; use vm_device::BusError; use super::resources::ResourceAllocator; @@ -42,4 +43,27 @@ impl PciDevices { Ok(()) } + + pub fn save(&self) -> PciDevicesState { + PciDevicesState { + 
pci_enabled: self.pci_segment.is_some(), + } + } + + pub fn restore( + &mut self, + state: &PciDevicesState, + resource_allocator: &Arc, + ) -> Result<(), PciManagerError> { + if state.pci_enabled { + self.attach_pci_segment(resource_allocator)?; + } + + Ok(()) + } +} + +#[derive(Default, Debug, Clone, Serialize, Deserialize)] +pub struct PciDevicesState { + pci_enabled: bool, } From 4f0776745fdf30ba4e9af31a2b67cdddef09159c Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 12 May 2025 10:46:16 +0200 Subject: [PATCH 22/56] pci: add tests for PCIe root bus Add an integration test that checks that `lspci` correctly locates the PCIe root complex if PCI is enabled for the microVM. Also, add a negative test that checks that PCIe root complex doesn't exist when PCI is not enabled. Also, extend coverage of, at least some of, the tests to ensure that they run with and without PCI configuration enabled. Do that by extending the `uvm_any*` fixtures to yield both variants. Signed-off-by: Babis Chalios --- tests/conftest.py | 79 +++++++++++++++++-- tests/framework/microvm.py | 4 + .../integration_tests/functional/test_net.py | 4 +- .../integration_tests/functional/test_pci.py | 28 +++++++ .../integration_tests/functional/test_rng.py | 18 +++-- .../security/test_vulnerabilities.py | 8 +- 6 files changed, 126 insertions(+), 15 deletions(-) create mode 100644 tests/integration_tests/functional/test_pci.py diff --git a/tests/conftest.py b/tests/conftest.py index 0c71a212b56..aad4016b554 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -567,12 +567,24 @@ def mem_size_mib(): return 256 +@pytest.fixture(params=[True, False]) +def pci_enabled(request): + """Fixture that allows configuring whether a microVM will have PCI enabled or not""" + yield request.param + + def uvm_booted( - microvm_factory, guest_kernel, rootfs, cpu_template, vcpu_count=2, mem_size_mib=256 + microvm_factory, + guest_kernel, + rootfs, + cpu_template, + pci_enabled, + vcpu_count=2, + 
mem_size_mib=256, ): """Return a booted uvm""" uvm = microvm_factory.build(guest_kernel, rootfs) - uvm.spawn() + uvm.spawn(pci=pci_enabled) uvm.basic_config(vcpu_count=vcpu_count, mem_size_mib=mem_size_mib) uvm.set_cpu_template(cpu_template) uvm.add_net_iface() @@ -580,9 +592,13 @@ def uvm_booted( return uvm -def uvm_restored(microvm_factory, guest_kernel, rootfs, cpu_template, **kwargs): +def uvm_restored( + microvm_factory, guest_kernel, rootfs, cpu_template, pci_enabled, **kwargs +): """Return a restored uvm""" - uvm = uvm_booted(microvm_factory, guest_kernel, rootfs, cpu_template, **kwargs) + uvm = uvm_booted( + microvm_factory, guest_kernel, rootfs, cpu_template, pci_enabled, **kwargs + ) snapshot = uvm.snapshot_full() uvm.kill() uvm2 = microvm_factory.build_from_snapshot(snapshot) @@ -603,6 +619,7 @@ def uvm_any( guest_kernel, rootfs, cpu_template_any, + pci_enabled, vcpu_count, mem_size_mib, ): @@ -612,6 +629,7 @@ def uvm_any( guest_kernel, rootfs, cpu_template_any, + pci_enabled, vcpu_count=vcpu_count, mem_size_mib=mem_size_mib, ) @@ -619,7 +637,13 @@ def uvm_any( @pytest.fixture def uvm_any_booted( - microvm_factory, guest_kernel, rootfs, cpu_template_any, vcpu_count, mem_size_mib + microvm_factory, + guest_kernel, + rootfs, + cpu_template_any, + pci_enabled, + vcpu_count, + mem_size_mib, ): """Return booted uvms""" return uvm_booted( @@ -627,6 +651,51 @@ def uvm_any_booted( guest_kernel, rootfs, cpu_template_any, + pci_enabled, + vcpu_count=vcpu_count, + mem_size_mib=mem_size_mib, + ) + + +@pytest.fixture +def uvm_any_with_pci( + uvm_ctor, + microvm_factory, + guest_kernel_acpi, + rootfs, + cpu_template_any, + vcpu_count, + mem_size_mib, +): + """Return booted uvms with PCI enabled""" + return uvm_ctor( + microvm_factory, + guest_kernel_acpi, + rootfs, + cpu_template_any, + True, + vcpu_count=vcpu_count, + mem_size_mib=mem_size_mib, + ) + + +@pytest.fixture +def uvm_any_without_pci( + uvm_ctor, + microvm_factory, + guest_kernel, + rootfs, + 
cpu_template_any, + vcpu_count, + mem_size_mib, +): + """Return booted uvms with PCI disabled""" + return uvm_ctor( + microvm_factory, + guest_kernel, + rootfs, + cpu_template_any, + False, vcpu_count=vcpu_count, mem_size_mib=mem_size_mib, ) diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index b6646e758f6..37a60b85d60 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -610,6 +610,7 @@ def spawn( log_show_origin=False, metrics_path="fc.ndjson", emit_metrics: bool = False, + pci: bool = False, ): """Start a microVM as a daemon or in a screen session.""" # pylint: disable=subprocess-run-check @@ -655,6 +656,9 @@ def spawn( # Checking the timings requires DEBUG level log messages self.time_api_requests = False + if pci: + self.jailer.extra_args["enable-pci"] = None + cmd = [ *self._pre_cmd, str(self.jailer_binary_path), diff --git a/tests/integration_tests/functional/test_net.py b/tests/integration_tests/functional/test_net.py index 7abf23406d5..10467affac8 100644 --- a/tests/integration_tests/functional/test_net.py +++ b/tests/integration_tests/functional/test_net.py @@ -85,9 +85,9 @@ def test_multi_queue_unsupported(uvm_plain): @pytest.fixture -def uvm_any(microvm_factory, uvm_ctor, guest_kernel, rootfs): +def uvm_any(microvm_factory, uvm_ctor, guest_kernel, rootfs, pci_enabled): """Return booted and restored uvm with no CPU templates""" - return uvm_ctor(microvm_factory, guest_kernel, rootfs, None) + return uvm_ctor(microvm_factory, guest_kernel, rootfs, None, pci_enabled) def test_tap_offload(uvm_any): diff --git a/tests/integration_tests/functional/test_pci.py b/tests/integration_tests/functional/test_pci.py new file mode 100644 index 00000000000..dc0827b1aae --- /dev/null +++ b/tests/integration_tests/functional/test_pci.py @@ -0,0 +1,28 @@ +# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 +"""Tests for the PCI devices""" + + +def test_pci_root_present(uvm_any_with_pci): + """ + Test that a guest with PCI enabled has a PCI root device. + """ + + vm = uvm_any_with_pci + devices = vm.ssh.run("lspci").stdout.strip().split("\n") + print(devices) + assert devices[0].startswith( + "00:00.0 Host bridge: Intel Corporation Device" + ), "PCI root not found in guest" + + +def test_pci_disabled(uvm_any_without_pci): + """ + Test that a guest with PCI disabled does not have a PCI root device but still works. + """ + + vm = uvm_any_without_pci + _, stdout, _ = vm.ssh.run("lspci") + assert ( + "00:00.0 Host bridge: Intel Corporation Device" not in stdout + ), "PCI root found in guest" diff --git a/tests/integration_tests/functional/test_rng.py b/tests/integration_tests/functional/test_rng.py index 1893230c51a..f2acf96735a 100644 --- a/tests/integration_tests/functional/test_rng.py +++ b/tests/integration_tests/functional/test_rng.py @@ -8,10 +8,12 @@ from host_tools.network import SSHConnection -def uvm_with_rng_booted(microvm_factory, guest_kernel, rootfs, rate_limiter): +def uvm_with_rng_booted( + microvm_factory, guest_kernel, rootfs, rate_limiter, pci_enabled +): """Return a booted microvm with virtio-rng configured""" uvm = microvm_factory.build(guest_kernel, rootfs) - uvm.spawn(log_level="INFO") + uvm.spawn(log_level="INFO", pci=pci_enabled) uvm.basic_config(vcpu_count=2, mem_size_mib=256) uvm.add_net_iface() uvm.api.entropy.put(rate_limiter=rate_limiter) @@ -21,9 +23,13 @@ def uvm_with_rng_booted(microvm_factory, guest_kernel, rootfs, rate_limiter): return uvm -def uvm_with_rng_restored(microvm_factory, guest_kernel, rootfs, rate_limiter): +def uvm_with_rng_restored( + microvm_factory, guest_kernel, rootfs, rate_limiter, pci_enabled +): """Return a restored uvm with virtio-rng configured""" - uvm = uvm_with_rng_booted(microvm_factory, guest_kernel, rootfs, rate_limiter) + uvm = uvm_with_rng_booted( + 
microvm_factory, guest_kernel, rootfs, rate_limiter, pci_enabled + ) snapshot = uvm.snapshot_full() uvm.kill() uvm2 = microvm_factory.build_from_snapshot(snapshot) @@ -44,9 +50,9 @@ def rate_limiter(request): @pytest.fixture -def uvm_any(microvm_factory, uvm_ctor, guest_kernel, rootfs, rate_limiter): +def uvm_any(microvm_factory, uvm_ctor, guest_kernel, rootfs, rate_limiter, pci_enabled): """Return booted and restored uvms""" - return uvm_ctor(microvm_factory, guest_kernel, rootfs, rate_limiter) + return uvm_ctor(microvm_factory, guest_kernel, rootfs, rate_limiter, pci_enabled) def list_rng_available(ssh_connection: SSHConnection) -> list[str]: diff --git a/tests/integration_tests/security/test_vulnerabilities.py b/tests/integration_tests/security/test_vulnerabilities.py index 0e530123255..50b0d526450 100644 --- a/tests/integration_tests/security/test_vulnerabilities.py +++ b/tests/integration_tests/security/test_vulnerabilities.py @@ -216,13 +216,17 @@ def microvm_factory_a(record_property): @pytest.fixture -def uvm_any_a(microvm_factory_a, uvm_ctor, guest_kernel, rootfs, cpu_template_any): +def uvm_any_a( + microvm_factory_a, uvm_ctor, guest_kernel, rootfs, cpu_template_any, pci_enabled +): """Return uvm with revision A firecracker Since pytest caches fixtures, this guarantees uvm_any_a will match a vm from uvm_any. 
See https://docs.pytest.org/en/stable/how-to/fixtures.html#fixtures-can-be-requested-more-than-once-per-test-return-values-are-cached """ - return uvm_ctor(microvm_factory_a, guest_kernel, rootfs, cpu_template_any) + return uvm_ctor( + microvm_factory_a, guest_kernel, rootfs, cpu_template_any, pci_enabled + ) def test_check_vulnerability_files_ab(request, uvm_any): From 5a97bd90ea545dc644fc9b018ef48d330f668529 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 21 May 2025 10:48:55 +0200 Subject: [PATCH 23/56] test: allow `extd_apicid` CPU feature on AMD guests PCI-enabled guest kernels enable the `extd_apicid` CPU feature for AMD CPU families after 16h. Our supported AMD families (Milan & Genoa) are both 19h. This is irrespective of whether PCI is enabled in Firecracker. Do not mark this as host-only when running with PCI enabled kernels, i.e. all kernels that support ACPI. Signed-off-by: Babis Chalios --- tests/framework/microvm.py | 2 ++ .../functional/test_cpu_features_host_vs_guest.py | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index 37a60b85d60..fef1a68eb4b 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -198,6 +198,7 @@ def __init__( assert microvm_id is not None self._microvm_id = microvm_id + self.pci_enabled = False self.kernel_file = None self.rootfs_file = None self.ssh_key = None @@ -657,6 +658,7 @@ def spawn( self.time_api_requests = False if pci: + self.pci_enabled = True self.jailer.extra_args["enable-pci"] = None cmd = [ diff --git a/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py b/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py index 4b66b077839..bd0f640fe21 100644 --- a/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py +++ b/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py @@ -91,7 +91,6 @@ "cqm_occup_llc", "decodeassists", "extapic", - "extd_apicid", 
"flushbyasid", "hw_pstate", "ibs", From 4305fb7843c0c70e119db894af9cf719dab738df Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 21 May 2025 15:12:06 +0200 Subject: [PATCH 24/56] test: add Rust integration tests for PCI-enabled uVMs We have some Rust integration tests that check building and booting of microVMs works correctly. Add variants for PCI-enabled microVMs. Signed-off-by: Babis Chalios --- src/vmm/src/test_utils/mod.rs | 19 +++-- src/vmm/tests/integration_tests.rs | 109 +++++++++++++++++------------ 2 files changed, 80 insertions(+), 48 deletions(-) diff --git a/src/vmm/src/test_utils/mod.rs b/src/vmm/src/test_utils/mod.rs index d3700c98925..3a45ce1118d 100644 --- a/src/vmm/src/test_utils/mod.rs +++ b/src/vmm/src/test_utils/mod.rs @@ -69,6 +69,7 @@ pub fn create_vmm( _kernel_image: Option<&str>, is_diff: bool, boot_microvm: bool, + pci_enabled: bool, ) -> (Arc>, EventManager) { let mut event_manager = EventManager::new().unwrap(); let empty_seccomp_filters = get_empty_filters(); @@ -82,7 +83,7 @@ pub fn create_vmm( None => boot_source_cfg.into(), }; let mock_vm_res = MockVmResources::new().with_boot_source(boot_source_cfg); - let resources: VmResources = if is_diff { + let mut resources: VmResources = if is_diff { mock_vm_res .with_vm_config(MockVmConfig::new().with_dirty_page_tracking().into()) .into() @@ -90,6 +91,8 @@ pub fn create_vmm( mock_vm_res.into() }; + resources.pci_enabled = pci_enabled; + let vmm = build_microvm_for_boot( &InstanceInfo::default(), &resources, @@ -106,16 +109,24 @@ pub fn create_vmm( } pub fn default_vmm(kernel_image: Option<&str>) -> (Arc>, EventManager) { - create_vmm(kernel_image, false, true) + create_vmm(kernel_image, false, true, false) } pub fn default_vmm_no_boot(kernel_image: Option<&str>) -> (Arc>, EventManager) { - create_vmm(kernel_image, false, false) + create_vmm(kernel_image, false, false, false) +} + +pub fn default_vmm_pci_no_boot(kernel_image: Option<&str>) -> (Arc>, EventManager) { + 
+ create_vmm(kernel_image, false, false, true) } #[cfg(target_arch = "x86_64")] pub fn dirty_tracking_vmm(kernel_image: Option<&str>) -> (Arc>, EventManager) { - create_vmm(kernel_image, true, true) + create_vmm(kernel_image, true, true, false) +} + +pub fn default_vmm_pci(kernel_image: Option<&str>) -> (Arc>, EventManager) { + create_vmm(kernel_image, false, true, true) } #[allow(clippy::undocumented_unsafe_blocks)] diff --git a/src/vmm/tests/integration_tests.rs b/src/vmm/tests/integration_tests.rs index 6982bf08c5b..88738599917 100644 --- a/src/vmm/tests/integration_tests.rs +++ b/src/vmm/tests/integration_tests.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use std::io::{Seek, SeekFrom}; +use std::sync::{Arc, Mutex}; use std::thread; use std::time::Duration; @@ -15,7 +16,9 @@ use vmm::rpc_interface::{ use vmm::seccomp::get_empty_filters; use vmm::snapshot::Snapshot; use vmm::test_utils::mock_resources::{MockVmResources, NOISY_KERNEL_IMAGE}; -use vmm::test_utils::{create_vmm, default_vmm, default_vmm_no_boot}; +use vmm::test_utils::{ + create_vmm, default_vmm, default_vmm_no_boot, default_vmm_pci, default_vmm_pci_no_boot, +}; use vmm::vmm_config::balloon::BalloonDeviceConfig; use vmm::vmm_config::boot_source::BootSourceConfig; use vmm::vmm_config::drive::BlockDeviceConfig; @@ -26,9 +29,23 @@ use vmm::vmm_config::snapshot::{ CreateSnapshotParams, LoadSnapshotParams, MemBackendConfig, MemBackendType, SnapshotType, }; use vmm::vmm_config::vsock::VsockDeviceConfig; -use vmm::{DumpCpuConfigError, EventManager, FcExitCode}; +use vmm::{DumpCpuConfigError, EventManager, FcExitCode, Vmm}; use vmm_sys_util::tempfile::TempFile; +fn check_booted_microvm(vmm: Arc>, mut evmgr: EventManager) { + // On x86_64, the vmm should exit once its workload completes and signals the exit event. + // On aarch64, the test kernel doesn't exit, so the vmm is force-stopped. 
+ #[cfg(target_arch = "x86_64")] + evmgr.run_with_timeout(500).unwrap(); + #[cfg(target_arch = "aarch64")] + vmm.lock().unwrap().stop(FcExitCode::Ok); + + assert_eq!( + vmm.lock().unwrap().shutdown_exit_code(), + Some(FcExitCode::Ok) + ); +} + #[test] fn test_build_and_boot_microvm() { // Error case: no boot source configured. @@ -47,25 +64,16 @@ fn test_build_and_boot_microvm() { } // Success case. - let (vmm, mut _evmgr) = default_vmm(None); + let (vmm, evmgr) = default_vmm(None); + check_booted_microvm(vmm, evmgr); - // On x86_64, the vmm should exit once its workload completes and signals the exit event. - // On aarch64, the test kernel doesn't exit, so the vmm is force-stopped. - #[cfg(target_arch = "x86_64")] - _evmgr.run_with_timeout(500).unwrap(); - #[cfg(target_arch = "aarch64")] - vmm.lock().unwrap().stop(FcExitCode::Ok); - - assert_eq!( - vmm.lock().unwrap().shutdown_exit_code(), - Some(FcExitCode::Ok) - ); + // microVM with PCI + let (vmm, evmgr) = default_vmm_pci(None); + check_booted_microvm(vmm, evmgr); } -#[test] -fn test_build_microvm() { +fn check_build_microvm(vmm: Arc>, mut evmgr: EventManager) { // The built microVM should be in the `VmState::Paused` state here. - let (vmm, mut _evtmgr) = default_vmm_no_boot(None); assert_eq!(vmm.lock().unwrap().instance_info().state, VmState::Paused); // The microVM should be able to resume and exit successfully. @@ -73,7 +81,7 @@ fn test_build_microvm() { // On aarch64, the test kernel doesn't exit, so the vmm is force-stopped. vmm.lock().unwrap().resume_vm().unwrap(); #[cfg(target_arch = "x86_64")] - _evtmgr.run_with_timeout(500).unwrap(); + evmgr.run_with_timeout(500).unwrap(); #[cfg(target_arch = "aarch64")] vmm.lock().unwrap().stop(FcExitCode::Ok); assert_eq!( @@ -83,10 +91,14 @@ fn test_build_microvm() { } #[test] -fn test_pause_resume_microvm() { - // Tests that pausing and resuming a microVM work as expected. 
- let (vmm, _) = default_vmm(None); +fn test_build_microvm() { + let (vmm, evtmgr) = default_vmm_no_boot(None); + check_build_microvm(vmm, evtmgr); + let (vmm, evtmgr) = default_vmm_pci_no_boot(None); + check_build_microvm(vmm, evtmgr); +} +fn pause_resume_microvm(vmm: Arc>) { let mut api_controller = RuntimeApiController::new(VmResources::default(), vmm.clone()); // There's a race between this thread and the vcpu thread, but this thread @@ -100,6 +112,17 @@ fn test_pause_resume_microvm() { vmm.lock().unwrap().stop(FcExitCode::Ok); } +#[test] +fn test_pause_resume_microvm() { + // Tests that pausing and resuming a microVM work as expected. + let (vmm, _) = default_vmm(None); + + pause_resume_microvm(vmm); + + let (vmm, _) = default_vmm_pci(None); + pause_resume_microvm(vmm); +} + #[test] fn test_dirty_bitmap_error() { // Error case: dirty tracking disabled. @@ -185,11 +208,11 @@ fn test_disallow_dump_cpu_config_without_pausing() { vmm.lock().unwrap().stop(FcExitCode::Ok); } -fn verify_create_snapshot(is_diff: bool) -> (TempFile, TempFile) { +fn verify_create_snapshot(is_diff: bool, pci_enabled: bool) -> (TempFile, TempFile) { let snapshot_file = TempFile::new().unwrap(); let memory_file = TempFile::new().unwrap(); - let (vmm, _) = create_vmm(Some(NOISY_KERNEL_IMAGE), is_diff, true); + let (vmm, _) = create_vmm(Some(NOISY_KERNEL_IMAGE), is_diff, true, pci_enabled); let resources = VmResources { machine_config: MachineConfig { mem_size_mib: 1, @@ -296,29 +319,27 @@ fn verify_load_snapshot(snapshot_file: TempFile, memory_file: TempFile) { #[test] fn test_create_and_load_snapshot() { - // Create diff snapshot. - let (snapshot_file, memory_file) = verify_create_snapshot(true); - // Create a new microVm from snapshot. This only tests code-level logic; it verifies - // that a microVM can be built with no errors from given snapshot. - // It does _not_ verify that the guest is actually restored properly. We're using - // python integration tests for that. 
- verify_load_snapshot(snapshot_file, memory_file); - - // Create full snapshot. - let (snapshot_file, memory_file) = verify_create_snapshot(false); - // Create a new microVm from snapshot. This only tests code-level logic; it verifies - // that a microVM can be built with no errors from given snapshot. - // It does _not_ verify that the guest is actually restored properly. We're using - // python integration tests for that. - verify_load_snapshot(snapshot_file, memory_file); + for (diff_snap, pci_enabled) in [(false, false), (false, true), (true, false), (true, true)] { + // Create snapshot. + let (snapshot_file, memory_file) = verify_create_snapshot(diff_snap, pci_enabled); + // Create a new microVm from snapshot. This only tests code-level logic; it verifies + // that a microVM can be built with no errors from given snapshot. + // It does _not_ verify that the guest is actually restored properly. We're using + // python integration tests for that. + verify_load_snapshot(snapshot_file, memory_file); + } } #[test] fn test_snapshot_load_sanity_checks() { - use vmm::persist::SnapShotStateSanityCheckError; - - let mut microvm_state = get_microvm_state_from_snapshot(); + let microvm_state = get_microvm_state_from_snapshot(false); + check_snapshot(microvm_state); + let microvm_state = get_microvm_state_from_snapshot(true); + check_snapshot(microvm_state); +} +fn check_snapshot(mut microvm_state: MicrovmState) { + use vmm::persist::SnapShotStateSanityCheckError; snapshot_state_sanity_check(&microvm_state).unwrap(); // Remove memory regions. @@ -331,9 +352,9 @@ fn test_snapshot_load_sanity_checks() { ); } -fn get_microvm_state_from_snapshot() -> MicrovmState { +fn get_microvm_state_from_snapshot(pci_enabled: bool) -> MicrovmState { // Create a diff snapshot - let (snapshot_file, _) = verify_create_snapshot(true); + let (snapshot_file, _) = verify_create_snapshot(true, pci_enabled); // Deserialize the microVM state. 
let snapshot_file_metadata = snapshot_file.as_file().metadata().unwrap(); @@ -344,7 +365,7 @@ fn get_microvm_state_from_snapshot() -> MicrovmState { } fn verify_load_snap_disallowed_after_boot_resources(res: VmmAction, res_name: &str) { - let (snapshot_file, memory_file) = verify_create_snapshot(false); + let (snapshot_file, memory_file) = verify_create_snapshot(false, false); let mut event_manager = EventManager::new().unwrap(); let empty_seccomp_filters = get_empty_filters(); From 5b6478934676f44e32e0f5db3a06c85616ecf3e5 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 21 May 2025 17:48:18 +0200 Subject: [PATCH 25/56] test: temporarily disable security A/B tests for PCI uVMs Tests test_spectre_meltdown_checker_on_guest and test_check_vulnerability_files_ab run A/B tests between the HEAD of the target branch and the tip of a PR branch. This will currently fail, because Firecracker builds from the HEAD of the target branch know nothing about the `--enable-pci` command line flag, so launching the Firecracker binary for revision A will fail. Only run these tests for non-PCI uVMs for now. Once this commit gets merged we will re-enable and make sure that everything works as expected. 
Signed-off-by: Babis Chalios --- .../security/test_vulnerabilities.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/tests/integration_tests/security/test_vulnerabilities.py b/tests/integration_tests/security/test_vulnerabilities.py index 50b0d526450..0e6eae283ab 100644 --- a/tests/integration_tests/security/test_vulnerabilities.py +++ b/tests/integration_tests/security/test_vulnerabilities.py @@ -216,22 +216,18 @@ def microvm_factory_a(record_property): @pytest.fixture -def uvm_any_a( - microvm_factory_a, uvm_ctor, guest_kernel, rootfs, cpu_template_any, pci_enabled -): +def uvm_any_a(microvm_factory_a, uvm_ctor, guest_kernel, rootfs, cpu_template_any): """Return uvm with revision A firecracker Since pytest caches fixtures, this guarantees uvm_any_a will match a vm from uvm_any. See https://docs.pytest.org/en/stable/how-to/fixtures.html#fixtures-can-be-requested-more-than-once-per-test-return-values-are-cached """ - return uvm_ctor( - microvm_factory_a, guest_kernel, rootfs, cpu_template_any, pci_enabled - ) + return uvm_ctor(microvm_factory_a, guest_kernel, rootfs, cpu_template_any, False) -def test_check_vulnerability_files_ab(request, uvm_any): +def test_check_vulnerability_files_ab(request, uvm_any_without_pci): """Test vulnerability files on guests""" - res_b = check_vulnerabilities_files_on_guest(uvm_any) + res_b = check_vulnerabilities_files_on_guest(uvm_any_without_pci) if global_props.buildkite_pr: # we only get the uvm_any_a fixtures if we need it uvm_a = request.getfixturevalue("uvm_any_a") @@ -243,11 +239,11 @@ def test_check_vulnerability_files_ab(request, uvm_any): def test_spectre_meltdown_checker_on_guest( request, - uvm_any, + uvm_any_without_pci, spectre_meltdown_checker, ): """Test with the spectre / meltdown checker on any supported guest.""" - res_b = spectre_meltdown_checker.get_report_for_guest(uvm_any) + res_b = spectre_meltdown_checker.get_report_for_guest(uvm_any_without_pci) if 
global_props.buildkite_pr: # we only get the uvm_any_a fixtures if we need it uvm_a = request.getfixturevalue("uvm_any_a") @@ -255,5 +251,5 @@ def test_spectre_meltdown_checker_on_guest( assert res_b <= res_a else: assert res_b == spectre_meltdown_checker.expected_vulnerabilities( - uvm_any.cpu_template_name + uvm_any_without_pci.cpu_template_name ) From 37833b68cca2f4405cece5694494741b2939696d Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Wed, 13 Nov 2024 12:13:22 +0000 Subject: [PATCH 26/56] test: update ci artifacts to support PCI devices 1. build the kernel with PCI/e support. 2. fix a race condition between udev renaming the network devices and fcnet setting up the network interfaces 3. install pciutils on the image Signed-off-by: Riccardo Mancini Signed-off-by: Babis Chalios --- resources/chroot.sh | 2 +- resources/guest_configs/pcie.config | 8 ++++++++ resources/overlay/etc/systemd/system/fcnet.service | 1 + resources/rebuild.sh | 11 ++++++----- 4 files changed, 16 insertions(+), 6 deletions(-) create mode 100644 resources/guest_configs/pcie.config diff --git a/resources/chroot.sh b/resources/chroot.sh index e7177d7e2ca..82061700b4a 100755 --- a/resources/chroot.sh +++ b/resources/chroot.sh @@ -11,7 +11,7 @@ PS4='+\t ' cp -ruv $rootfs/* / -packages="udev systemd-sysv openssh-server iproute2 curl socat python3-minimal iperf3 iputils-ping fio kmod tmux hwloc-nox vim-tiny trace-cmd linuxptp strace" +packages="udev systemd-sysv openssh-server iproute2 curl socat python3-minimal iperf3 iputils-ping fio kmod tmux hwloc-nox vim-tiny trace-cmd linuxptp strace pciutils" # msr-tools is only supported on x86-64. 
arch=$(uname -m) diff --git a/resources/guest_configs/pcie.config b/resources/guest_configs/pcie.config new file mode 100644 index 00000000000..b7262f7ae73 --- /dev/null +++ b/resources/guest_configs/pcie.config @@ -0,0 +1,8 @@ +CONFIG_BLK_MQ_PCI=y +CONFIG_PCI=y +CONFIG_PCI_MMCONFIG=y +CONFIG_PCI_MSI=y +CONFIG_PCIEPORTBUS=y +CONFIG_VIRTIO_PCI=y +CONFIG_PCI_HOST_COMMON=y +CONFIG_PCI_HOST_GENERIC=y diff --git a/resources/overlay/etc/systemd/system/fcnet.service b/resources/overlay/etc/systemd/system/fcnet.service index 26d3af1dc20..ace1c8322e1 100644 --- a/resources/overlay/etc/systemd/system/fcnet.service +++ b/resources/overlay/etc/systemd/system/fcnet.service @@ -1,5 +1,6 @@ [Service] Type=oneshot +ExecStartPre=/usr/bin/udevadm settle ExecStart=/usr/local/bin/fcnet-setup.sh [Install] WantedBy=sshd.service diff --git a/resources/rebuild.sh b/resources/rebuild.sh index 56afd1bdbac..dabffa8c2ae 100755 --- a/resources/rebuild.sh +++ b/resources/rebuild.sh @@ -223,15 +223,16 @@ function build_al_kernels { clone_amazon_linux_repo CI_CONFIG="$PWD/guest_configs/ci.config" + PCIE_CONFIG="$PWD/guest_configs/pcie.config" if [[ "$KERNEL_VERSION" == @(all|5.10) ]]; then - build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10.config "$CI_CONFIG" + build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10.config "$CI_CONFIG" "$PCIE_CONFIG" fi if [[ $ARCH == "x86_64" && "$KERNEL_VERSION" == @(all|5.10-no-acpi) ]]; then - build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10-no-acpi.config "$CI_CONFIG" + build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10-no-acpi.config "$CI_CONFIG" "$PCIE_CONFIG" fi if [[ "$KERNEL_VERSION" == @(all|6.1) ]]; then - build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-6.1.config "$CI_CONFIG" + build_al_kernel $PWD/guest_configs/microvm-kernel-ci-$ARCH-6.1.config "$CI_CONFIG" "$PCIE_CONFIG" fi # Build debug kernels @@ -240,11 +241,11 @@ function build_al_kernels { OUTPUT_DIR=$OUTPUT_DIR/debug mkdir -pv 
$OUTPUT_DIR if [[ "$KERNEL_VERSION" == @(all|5.10) ]]; then - build_al_kernel "$PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10.config" "$CI_CONFIG" "$FTRACE_CONFIG" "$DEBUG_CONFIG" + build_al_kernel "$PWD/guest_configs/microvm-kernel-ci-$ARCH-5.10.config" "$CI_CONFIG" "$PCIE_CONFIG" "$FTRACE_CONFIG" "$DEBUG_CONFIG" vmlinux_split_debuginfo $OUTPUT_DIR/vmlinux-5.10.* fi if [[ "$KERNEL_VERSION" == @(all|6.1) ]]; then - build_al_kernel "$PWD/guest_configs/microvm-kernel-ci-$ARCH-6.1.config" "$CI_CONFIG" "$FTRACE_CONFIG" "$DEBUG_CONFIG" + build_al_kernel "$PWD/guest_configs/microvm-kernel-ci-$ARCH-6.1.config" "$CI_CONFIG" "$PCIE_CONFIG" "$FTRACE_CONFIG" "$DEBUG_CONFIG" vmlinux_split_debuginfo $OUTPUT_DIR/vmlinux-6.1.* fi } From d6477fd4e01d29bb40a40d957d86980326c81c39 Mon Sep 17 00:00:00 2001 From: Riccardo Mancini Date: Mon, 10 Mar 2025 18:30:53 +0000 Subject: [PATCH 27/56] chore(pcie): update artifacts to 1.13 I've rebuilt the CI artifacts for the new development version. Signed-off-by: Riccardo Mancini Signed-off-by: Babis Chalios --- tools/devtool | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/devtool b/tools/devtool index 9d12c58fd60..c8767e35559 100755 --- a/tools/devtool +++ b/tools/devtool @@ -569,8 +569,9 @@ ensure_ci_artifacts() { # Fetch all the artifacts so they are local say "Fetching CI artifacts from S3" - FC_VERSION=$(cmd_sh "cd src/firecracker/src; cargo pkgid | cut -d# -f2 | cut -d. -f1-2") - S3_URL=s3://spec.ccfc.min/firecracker-ci/v$FC_VERSION/$(uname -m) + # FC_VERSION=$(cmd_sh "cd src/firecracker/src; cargo pkgid | cut -d# -f2 | cut -d. -f1-2") + # S3_URL=s3://spec.ccfc.min/firecracker-ci/v$FC_VERSION/$(uname -m) + S3_URL=s3://spec.ccfc.min/firecracker-ci/v1.13-pcie/$(uname -m) ARTIFACTS=$MICROVM_IMAGES_DIR/$(uname -m) if [ ! 
-d "$ARTIFACTS" ]; then mkdir -pv $ARTIFACTS From 00194571b16ada7d94b40ada7cef45b181e75fa4 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Thu, 22 May 2025 15:50:29 +0200 Subject: [PATCH 28/56] tests: fix MMIO gaps in memory monitor tool The memory monitor was only assuming a single MMIO gap on x86_64 when calculating the memory regions that corresponded to guest memory. Now we need to account for two MMIO gaps in x86 and one in ARM. Signed-off-by: Babis Chalios --- tests/host_tools/memory.py | 79 ++++++++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 17 deletions(-) diff --git a/tests/host_tools/memory.py b/tests/host_tools/memory.py index 93380a9321d..eacc14ac48a 100644 --- a/tests/host_tools/memory.py +++ b/tests/host_tools/memory.py @@ -8,6 +8,8 @@ import psutil +from framework.properties import global_props + class MemoryUsageExceededError(Exception): """A custom exception containing details on excessive memory usage.""" @@ -15,8 +17,8 @@ class MemoryUsageExceededError(Exception): def __init__(self, usage, threshold, *args): """Compose the error message containing the memory consumption.""" super().__init__( - f"Memory usage ({usage / 2**20:.2f} MiB) exceeded maximum threshold " - f"({threshold / 2**20} MiB)", + f"Memory usage ({usage / 1 << 20:.2f} MiB) exceeded maximum threshold " + f"({threshold / 1 << 20} MiB)", *args, ) @@ -28,10 +30,20 @@ class MemoryMonitor(Thread): VMM memory usage. 
""" - # If guest memory is >3328MB, it is split in a 2nd region - X86_MEMORY_GAP_START = 3328 * 2**20 - - def __init__(self, vm, threshold=5 * 2**20, period_s=0.05): + # If guest memory is >3GiB, it is split in a 2nd region + # Gap starts at 3GiBs and is 1GiB long + X86_32BIT_MEMORY_GAP_START = 3 << 30 + X86_32BIT_MEMORY_GAP_SIZE = 1 << 30 + # If guest memory is >255GiB, it is split in a 3rd region + # Gap starts at 256 GiB and is 256GiB long + X86_64BIT_MEMORY_GAP_START = 256 << 30 + # On ARM64 we just have a single gap, but memory starts at an offset + # Gap starts at 256 GiB and is GiB long + # Memory starts at 2GiB + ARM64_64BIT_MEMORY_GAP_START = 256 << 30 + ARM64_MEMORY_START = 2 << 30 + + def __init__(self, vm, threshold=5 << 20, period_s=0.01): """Initialize monitor attributes.""" Thread.__init__(self) self._vm = vm @@ -72,7 +84,9 @@ def run(self): mem_total = 0 for mmap in mmaps: if self.is_guest_mem(mmap.size, guest_mem_bytes): + print(f"Region {mmap} is guest memory") continue + mem_total += mmap.rss self._current_rss = mem_total if mem_total > self.threshold: @@ -81,24 +95,55 @@ def run(self): time.sleep(self._period_s) - def is_guest_mem(self, size, guest_mem_bytes): + def is_guest_mem_x86(self, size, guest_mem_bytes): """ - If the address is recognised as a guest memory region, - return True, otherwise return False. 
+ Checks if a region is a guest memory region based on + x86_64 physical memory layout """ + return size in ( + # memory fits before the first gap + guest_mem_bytes, + # guest memory spans at least two regions & memory fits before the second gap + self.X86_32BIT_MEMORY_GAP_START, + # guest memory spans exactly two regions + guest_mem_bytes - self.X86_32BIT_MEMORY_GAP_START, + # guest memory fills the space between the two gaps + self.X86_64BIT_MEMORY_GAP_START + - self.X86_32BIT_MEMORY_GAP_START + - self.X86_32BIT_MEMORY_GAP_SIZE, + # guest memory spans 3 regions, this is what remains past the second gap + guest_mem_bytes + - self.X86_64BIT_MEMORY_GAP_START + + self.X86_32BIT_MEMORY_GAP_SIZE, + ) - # If x86_64 guest memory exceeds 3328M, it will be split - # in 2 regions: 3328M and the rest. We have 3 cases here - # to recognise a guest memory region: - # - its size matches the guest memory exactly - # - its size is 3328M - # - its size is guest memory minus 3328M. + def is_guest_mem_arch64(self, size, guest_mem_bytes): + """ + Checks if a region is a guest memory region based on + ARM64 physical memory layout + """ return size in ( + # guest memory fits before the gap guest_mem_bytes, - self.X86_MEMORY_GAP_START, - guest_mem_bytes - self.X86_MEMORY_GAP_START, + # guest memory fills the space before the gap + self.ARM64_64BIT_MEMORY_GAP_START - self.ARM64_MEMORY_START, + # guest memory spans 2 regions, this is what remains past the gap + guest_mem_bytes + - self.ARM64_64BIT_MEMORY_GAP_START + + self.ARM64_MEMORY_START, ) + def is_guest_mem(self, size, guest_mem_bytes): + """ + If the address is recognised as a guest memory region, + return True, otherwise return False. 
+ """ + + if global_props.cpu_architecture == "x86_64": + return self.is_guest_mem_x86(size, guest_mem_bytes) + + return self.is_guest_mem_arch64(size, guest_mem_bytes) + def check_samples(self): """Check that there are no samples over the threshold.""" if self._exceeded is not None: From b4a2de1e0d5de85c18cfd0689c792160bcdbd7ee Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 4 Jun 2025 12:33:10 +0200 Subject: [PATCH 29/56] fix: boottimer device MMIO address When we re-arranged the MMIO address space in commit 9a165d17f1ba (arch: define 64-bit capable MMIO memory regions) we moved the MMIO region of the boot timer device for x86 systems, but we didn't update the init scripts that hardcode it and use it to report boot time timestamp back to Firecracker. Update the init.c and initramfs values for the region. Also, add a functional test that runs during CI PR tests and makes sure the boot timer works. Signed-off-by: Babis Chalios --- resources/overlay/usr/local/bin/init.c | 2 +- resources/rebuild.sh | 2 +- .../performance/test_boottime.py | 42 +++++++++++++------ 3 files changed, 31 insertions(+), 15 deletions(-) diff --git a/resources/overlay/usr/local/bin/init.c b/resources/overlay/usr/local/bin/init.c index caa3e9d91d5..4d469171ae5 100644 --- a/resources/overlay/usr/local/bin/init.c +++ b/resources/overlay/usr/local/bin/init.c @@ -13,7 +13,7 @@ // Position on the bus is defined by MMIO_LEN increments, where MMIO_LEN is // defined as 0x1000 in vmm/src/device_manager/mmio.rs. 
#ifdef __x86_64__ -#define MAGIC_MMIO_SIGNAL_GUEST_BOOT_COMPLETE 0xd0000000 +#define MAGIC_MMIO_SIGNAL_GUEST_BOOT_COMPLETE 0xc0000000 #endif #ifdef __aarch64__ #define MAGIC_MMIO_SIGNAL_GUEST_BOOT_COMPLETE 0x40000000 diff --git a/resources/rebuild.sh b/resources/rebuild.sh index dabffa8c2ae..d3d4ffe052e 100755 --- a/resources/rebuild.sh +++ b/resources/rebuild.sh @@ -92,7 +92,7 @@ function build_initramfs { # Report guest boot time back to Firecracker via MMIO # See arch/src/lib.rs and the BootTimer device - MAGIC_BOOT_ADDRESS=0xd0000000 + MAGIC_BOOT_ADDRESS=0xc0000000 if [ $ARCH = "aarch64" ]; then MAGIC_BOOT_ADDRESS=0x40000000 fi diff --git a/tests/integration_tests/performance/test_boottime.py b/tests/integration_tests/performance/test_boottime.py index ad0822b0436..7708451ec7f 100644 --- a/tests/integration_tests/performance/test_boottime.py +++ b/tests/integration_tests/performance/test_boottime.py @@ -94,6 +94,33 @@ def to_ms(v, unit): return kernel, userspace, total +def launch_vm_with_boot_timer( + microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib +): + """Launches a microVM with guest-timer and returns the reported metrics for it""" + vm = microvm_factory.build(guest_kernel_acpi, rootfs_rw) + vm.jailer.extra_args.update({"boot-timer": None}) + vm.spawn() + vm.basic_config( + vcpu_count=vcpu_count, + mem_size_mib=mem_size_mib, + boot_args=DEFAULT_BOOT_ARGS + " init=/usr/local/bin/init", + enable_entropy_device=True, + ) + vm.add_net_iface() + vm.start() + vm.pin_threads(0) + + boot_time_us, cpu_boot_time_us = get_boottime_device_info(vm) + + return (vm, boot_time_us, cpu_boot_time_us) + + +def test_boot_timer(microvm_factory, guest_kernel_acpi, rootfs): + """Tests that the boot timer device works""" + launch_vm_with_boot_timer(microvm_factory, guest_kernel_acpi, rootfs, 1, 128) + + @pytest.mark.parametrize( "vcpu_count,mem_size_mib", [(1, 128), (1, 1024), (2, 2048), (4, 4096)], @@ -105,20 +132,9 @@ def test_boottime( """Test boot time 
with different guest configurations""" for i in range(10): - vm = microvm_factory.build(guest_kernel_acpi, rootfs_rw) - vm.jailer.extra_args.update({"boot-timer": None}) - vm.spawn() - vm.basic_config( - vcpu_count=vcpu_count, - mem_size_mib=mem_size_mib, - boot_args=DEFAULT_BOOT_ARGS + " init=/usr/local/bin/init", - enable_entropy_device=True, + vm, boot_time_us, cpu_boot_time_us = launch_vm_with_boot_timer( + microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib ) - vm.add_net_iface() - vm.start() - vm.pin_threads(0) - - boot_time_us, cpu_boot_time_us = get_boottime_device_info(vm) if i == 0: metrics.set_dimensions( From d235beb019f58c02a77e959cfdde82e572c5cb74 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 4 Jun 2025 16:13:32 +0200 Subject: [PATCH 30/56] test: remove logging from memory monitor Commit be5a600e (tests: fix MMIO gaps in memory monitor tool) that fixed the memory monitor to account for the 64-bit MMIO region included a left-over debug print. Remove it. Signed-off-by: Babis Chalios --- tests/host_tools/memory.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/host_tools/memory.py b/tests/host_tools/memory.py index eacc14ac48a..d9c2a01fe06 100644 --- a/tests/host_tools/memory.py +++ b/tests/host_tools/memory.py @@ -84,7 +84,6 @@ def run(self): mem_total = 0 for mmap in mmaps: if self.is_guest_mem(mmap.size, guest_mem_bytes): - print(f"Region {mmap} is guest memory") continue mem_total += mmap.rss From 112b8bddf8da8aad594936ba187ca4292f6c08d6 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 30 May 2025 18:14:21 +0200 Subject: [PATCH 31/56] chore: update kvm and vmm-sys-util dependencies We need the new KvmIrqRouting FamStruct wrapper from kvm-bindings, which, however, forces us to update vmm-sys-util to 0.14.0 and also bump all downstream dependencies of vmm-sys-util to use that version.
Signed-off-by: Babis Chalios --- Cargo.lock | 124 ++++++++++++++++++++++++++++++------- src/firecracker/Cargo.toml | 5 +- 2 files changed, 104 insertions(+), 25 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 33c5b3e88a6..50cef2c09df 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -98,7 +98,7 @@ version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c8bdeb6047d8983be085bab0ba1472e6dc604e7041dbf6fcd5e71523014fae9" dependencies = [ - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -109,7 +109,7 @@ checksum = "403f75924867bb1033c59fbf0797484329750cfbe3c4325cd33127941fabc882" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -218,7 +218,7 @@ dependencies = [ "bitflags 2.9.1", "cexpr", "clang-sys", - "itertools 0.10.5", + "itertools 0.12.1", "lazy_static", "lazycell", "log", @@ -256,12 +256,6 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" -[[package]] -name = "byteorder" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" - [[package]] name = "cargo_toml" version = "0.22.1" @@ -601,7 +595,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "778e2ac28f6c47af28e4907f13ffd1e1ddbd400980a9abd7c8df189bf578a5ad" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.60.2", ] [[package]] @@ -733,7 +727,7 @@ version = "0.5.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589533453244b0995c858700322199b2becb13b627df2851f64a2775d024abcf" dependencies = [ - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -770,6 +764,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.13.0" @@ -898,7 +901,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" dependencies = [ "cfg-if", - "windows-targets", + "windows-targets 0.53.2", ] [[package]] @@ -967,7 +970,7 @@ dependencies = [ [[package]] name = "micro_http" version = "0.1.0" -source = "git+https://github.com/firecracker-microvm/micro-http#bf5098916006912f8dd35aaa6daa5579c6c297b2" +source = "git+https://github.com/firecracker-microvm/micro-http#11cc5da16ac86f9107d3f45791944fa6b964a6a9" dependencies = [ "libc", "vmm-sys-util", @@ -1240,7 +1243,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -1816,7 +1819,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -1831,7 +1834,16 @@ version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.2", ] [[package]] @@ -1840,14 +1852,30 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_gnullvm", - "windows_i686_msvc", - 
"windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.53.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c66f69fcc9ce11da9966ddb31a40968cad001c5bedeb5c2b82ede4253ab48aef" +dependencies = [ + "windows_aarch64_gnullvm 0.53.0", + "windows_aarch64_msvc 0.53.0", + "windows_i686_gnu 0.53.0", + "windows_i686_gnullvm 0.53.0", + "windows_i686_msvc 0.53.0", + "windows_x86_64_gnu 0.53.0", + "windows_x86_64_gnullvm 0.53.0", + "windows_x86_64_msvc 0.53.0", ] [[package]] @@ -1856,48 +1884,96 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" + [[package]] name = "windows_i686_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" +[[package]] +name = "windows_i686_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" + [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" + [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_i686_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" + [[package]] name = "winnow" version = "0.7.11" diff --git a/src/firecracker/Cargo.toml b/src/firecracker/Cargo.toml index bcd3a032cfc..ce971b56122 100644 --- a/src/firecracker/Cargo.toml +++ b/src/firecracker/Cargo.toml @@ -42,7 +42,10 @@ serde_json = "1.0.140" [dev-dependencies] cargo_toml = "0.22.1" libc = "0.2.174" -regex = { version = "1.11.1", default-features = false, features = ["std", "unicode-perl"] } +regex = { version = "1.11.1", default-features = false, features = [ + "std", + "unicode-perl", +] } # Dev-Dependencies for uffd examples serde = { version = "1.0.219", features = ["derive"] } From d11e5da298d7b7c5c222060cd0cefbf20ae9bf12 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 3 Jun 2025 18:14:33 +0200 Subject: [PATCH 32/56] pci: fixes in PCI crate Define thiserror::Error and displaydoc::Display for various error types in the vended PCI crate. This way we can embed them in our error types downstream. Also export a few types and struct fields that were private and we will be needing them. 
Signed-off-by: Babis Chalios --- Cargo.lock | 1 + src/pci/Cargo.toml | 1 + src/pci/src/bus.rs | 4 ++-- src/pci/src/configuration.rs | 6 +++--- src/pci/src/device.rs | 29 +++++------------------------ src/pci/src/lib.rs | 11 +++++++---- src/pci/src/msix.rs | 18 +++++++++++++++--- 7 files changed, 34 insertions(+), 36 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 50cef2c09df..3b5aa2637bd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1047,6 +1047,7 @@ name = "pci" version = "0.1.0" dependencies = [ "byteorder", + "displaydoc", "libc", "log", "serde", diff --git a/src/pci/Cargo.toml b/src/pci/Cargo.toml index c88cd270b23..3549d5010fe 100644 --- a/src/pci/Cargo.toml +++ b/src/pci/Cargo.toml @@ -13,6 +13,7 @@ default = [] [dependencies] byteorder = "1.5.0" +displaydoc = "0.2.5" libc = "0.2.172" log = "0.4.27" serde = { version = "1.0.219", features = ["derive"] } diff --git a/src/pci/src/bus.rs b/src/pci/src/bus.rs index cb42b4ee9c5..775238edff9 100644 --- a/src/pci/src/bus.rs +++ b/src/pci/src/bus.rs @@ -24,7 +24,7 @@ const DEVICE_ID_INTEL_VIRT_PCIE_HOST: u16 = 0x0d57; const NUM_DEVICE_IDS: usize = 32; /// Errors for device manager. -#[derive(Debug)] +#[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum PciRootError { /// Could not allocate device address space for the device. AllocateDeviceAddrs(PciDeviceError), @@ -103,7 +103,7 @@ impl PciDevice for PciRoot { pub struct PciBus { /// Devices attached to this bus. /// Device 0 is host bridge. 
- devices: HashMap>>, + pub devices: HashMap>>, device_reloc: Arc, device_ids: Vec, } diff --git a/src/pci/src/configuration.rs b/src/pci/src/configuration.rs index 3a53167148c..c37f8026fbe 100644 --- a/src/pci/src/configuration.rs +++ b/src/pci/src/configuration.rs @@ -409,7 +409,7 @@ struct PciBar { r#type: Option, } -#[derive(Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct PciConfigurationState { registers: Vec, writable_bits: Vec, @@ -466,7 +466,7 @@ impl From for PciBarType { } } -#[derive(Copy, Clone)] +#[derive(Debug, Copy, Clone, Serialize, Deserialize)] pub enum PciBarPrefetchable { NotPrefetchable = 0, Prefetchable = 0x08, @@ -481,7 +481,7 @@ impl From for bool { } } -#[derive(Copy, Clone)] +#[derive(Debug, Copy, Clone, Serialize, Deserialize)] pub struct PciBarConfiguration { addr: u64, size: u64, diff --git a/src/pci/src/device.rs b/src/pci/src/device.rs index d3bd3056a36..bf89331faa9 100644 --- a/src/pci/src/device.rs +++ b/src/pci/src/device.rs @@ -6,7 +6,6 @@ // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause use std::any::Any; -use std::fmt::{self, Display}; use std::sync::{Arc, Barrier}; use std::{io, result}; @@ -16,39 +15,21 @@ use vm_device::Resource; use crate::configuration::{self, PciBarRegionType}; use crate::PciBarConfiguration; -#[derive(Debug)] +#[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum Error { - /// Setup of the device capabilities failed. + /// Setup of the device capabilities failed: {0}. CapabilitiesSetup(configuration::Error), - /// Allocating space for an IO BAR failed. + /// Allocating space for an IO BAR failed, size={0}. IoAllocationFailed(u64), - /// Registering an IO BAR failed. + /// Registering an IO BAR at address {0} failed: {1} IoRegistrationFailed(u64, configuration::Error), /// Expected resource not found. MissingResource, - /// Invalid resource. 
+ /// Invalid resource InvalidResource(Resource), } pub type Result = std::result::Result; -impl Display for Error { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - use self::Error::*; - - match self { - CapabilitiesSetup(e) => write!(f, "failed to add capability {e}"), - IoAllocationFailed(size) => { - write!(f, "failed to allocate space for an IO BAR, size={size}") - } - IoRegistrationFailed(addr, e) => { - write!(f, "failed to register an IO BAR, addr={addr} err={e}") - } - MissingResource => write!(f, "failed to find expected resource"), - InvalidResource(r) => write!(f, "invalid resource {r:?}"), - } - } -} - #[derive(Clone, Copy)] pub struct BarReprogrammingParams { pub old_base: u64, diff --git a/src/pci/src/lib.rs b/src/pci/src/lib.rs index 2672159e474..3162da292de 100644 --- a/src/pci/src/lib.rs +++ b/src/pci/src/lib.rs @@ -24,15 +24,18 @@ use serde::de::Visitor; pub use self::bus::{PciBus, PciConfigIo, PciConfigMmio, PciRoot, PciRootError}; pub use self::configuration::{ PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciCapability, PciCapabilityId, - PciClassCode, PciConfiguration, PciExpressCapabilityId, PciHeaderType, PciMassStorageSubclass, - PciNetworkControllerSubclass, PciProgrammingInterface, PciSerialBusSubClass, PciSubclass, - PCI_CONFIGURATION_ID, + PciClassCode, PciConfiguration, PciConfigurationState, PciExpressCapabilityId, PciHeaderType, + PciMassStorageSubclass, PciNetworkControllerSubclass, PciProgrammingInterface, + PciSerialBusSubClass, PciSubclass, PCI_CONFIGURATION_ID, }; pub use self::device::{ BarReprogrammingParams, DeviceRelocation, Error as PciDeviceError, PciDevice, }; pub use self::msi::{msi_num_enabled_vectors, MsiCap, MsiConfig}; -pub use self::msix::{MsixCap, MsixConfig, MsixTableEntry, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE}; +pub use self::msix::{ + Error as MsixError, MsixCap, MsixConfig, MsixConfigState, MsixTableEntry, MSIX_CONFIG_ID, + MSIX_TABLE_ENTRY_SIZE, +}; /// PCI has four interrupt pins 
A->D. #[derive(Copy, Clone)] diff --git a/src/pci/src/msix.rs b/src/pci/src/msix.rs index 4b3cf688980..be5aa3b8cf1 100644 --- a/src/pci/src/msix.rs +++ b/src/pci/src/msix.rs @@ -26,7 +26,7 @@ const MSIX_ENABLE_MASK: u16 = (1 << MSIX_ENABLE_BIT) as u16; pub const MSIX_TABLE_ENTRY_SIZE: usize = 16; pub const MSIX_CONFIG_ID: &str = "msix_config"; -#[derive(Debug)] +#[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum Error { /// Failed enabling the interrupt route. EnableInterruptRoute(io::Error), @@ -59,7 +59,7 @@ impl Default for MsixTableEntry { } } -#[derive(Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct MsixConfigState { table_entries: Vec, pba_entries: Vec, @@ -71,11 +71,23 @@ pub struct MsixConfig { pub table_entries: Vec, pub pba_entries: Vec, pub devid: u32, - interrupt_source_group: Arc, + pub interrupt_source_group: Arc, masked: bool, enabled: bool, } +impl std::fmt::Debug for MsixConfig { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("MsixConfig") + .field("table_entries", &self.table_entries) + .field("pba_entries", &self.pba_entries) + .field("devid", &self.devid) + .field("masked", &self.masked) + .field("enabled", &self.enabled) + .finish() + } +} + impl MsixConfig { pub fn new( msix_vectors: u16, From 709e66582a42ba8b152b81a020a26fb645967c4a Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 10 Jun 2025 17:05:27 +0200 Subject: [PATCH 33/56] vm-device: return reference to EventFd from Interrupt trait Instead of returning an `EventFd` type, which will actually force us to clone the file descriptor in the Firecracker side. 
Signed-off-by: Babis Chalios --- src/vm-device/src/interrupt/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vm-device/src/interrupt/mod.rs b/src/vm-device/src/interrupt/mod.rs index f4aec52a2e0..da5d87a4e1a 100644 --- a/src/vm-device/src/interrupt/mod.rs +++ b/src/vm-device/src/interrupt/mod.rs @@ -172,7 +172,7 @@ pub trait InterruptSourceGroup: Send + Sync { /// to inject interrupts into a guest, by writing to the file returned /// by this method. #[allow(unused_variables)] - fn notifier(&self, index: InterruptIndex) -> Option; + fn notifier(&self, index: InterruptIndex) -> Option<&EventFd>; /// Update the interrupt source group configuration. /// From 3d143dc4482b841b68605527cc2a0ffbcfd9c02c Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 25 Jun 2025 12:57:06 +0200 Subject: [PATCH 34/56] cleanup: remove unused code from pci and vm-device crates This is code we are not going to use in Firecracker. Remove it, so we can keep the crates we vend as minimal as possible, including only things we are actually using. 
Signed-off-by: Babis Chalios --- src/pci/src/lib.rs | 2 - src/pci/src/msi.rs | 282 --------------------------- src/vm-device/src/dma_mapping/mod.rs | 18 -- src/vm-device/src/lib.rs | 1 - 4 files changed, 303 deletions(-) delete mode 100644 src/pci/src/msi.rs delete mode 100644 src/vm-device/src/dma_mapping/mod.rs diff --git a/src/pci/src/lib.rs b/src/pci/src/lib.rs index 3162da292de..f1dec5b126a 100644 --- a/src/pci/src/lib.rs +++ b/src/pci/src/lib.rs @@ -12,7 +12,6 @@ extern crate log; mod bus; mod configuration; mod device; -mod msi; mod msix; use std::fmt::{self, Debug, Display}; @@ -31,7 +30,6 @@ pub use self::configuration::{ pub use self::device::{ BarReprogrammingParams, DeviceRelocation, Error as PciDeviceError, PciDevice, }; -pub use self::msi::{msi_num_enabled_vectors, MsiCap, MsiConfig}; pub use self::msix::{ Error as MsixError, MsixCap, MsixConfig, MsixConfigState, MsixTableEntry, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, diff --git a/src/pci/src/msi.rs b/src/pci/src/msi.rs deleted file mode 100644 index 16d593cd115..00000000000 --- a/src/pci/src/msi.rs +++ /dev/null @@ -1,282 +0,0 @@ -// Copyright © 2019 Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause -// - -use std::io; -use std::sync::Arc; - -use byteorder::{ByteOrder, LittleEndian}; -use serde::{Deserialize, Serialize}; -use thiserror::Error; -use vm_device::interrupt::{ - InterruptIndex, InterruptSourceConfig, InterruptSourceGroup, MsiIrqSourceConfig, -}; - -// MSI control masks -const MSI_CTL_ENABLE: u16 = 0x1; -const MSI_CTL_MULTI_MSG_ENABLE: u16 = 0x70; -const MSI_CTL_64_BITS: u16 = 0x80; -const MSI_CTL_PER_VECTOR: u16 = 0x100; - -// MSI message offsets -const MSI_MSG_CTL_OFFSET: u64 = 0x2; -const MSI_MSG_ADDR_LO_OFFSET: u64 = 0x4; - -// MSI message masks -const MSI_MSG_ADDR_LO_MASK: u32 = 0xffff_fffc; - -pub fn msi_num_enabled_vectors(msg_ctl: u16) -> usize { - let field = (msg_ctl >> 4) & 0x7; - - if field > 5 { - return 0; - } - - 1 << field -} - -#[derive(Error, 
Debug)] -pub enum Error { - #[error("Failed enabling the interrupt route: {0}")] - EnableInterruptRoute(io::Error), - #[error("Failed updating the interrupt route: {0}")] - UpdateInterruptRoute(io::Error), -} - -#[derive(Clone, Copy, Default, Serialize, Deserialize)] -pub struct MsiCap { - // Message Control Register - // 0: MSI enable. - // 3-1; Multiple message capable. - // 6-4: Multiple message enable. - // 7: 64 bits address capable. - // 8: Per-vector masking capable. - // 15-9: Reserved. - pub msg_ctl: u16, - // Message Address (LSB) - // 1-0: Reserved. - // 31-2: Message address. - pub msg_addr_lo: u32, - // Message Upper Address (MSB) - // 31-0: Message address. - pub msg_addr_hi: u32, - // Message Data - // 15-0: Message data. - pub msg_data: u16, - // Mask Bits - // 31-0: Mask bits. - pub mask_bits: u32, - // Pending Bits - // 31-0: Pending bits. - pub pending_bits: u32, -} - -impl MsiCap { - fn addr_64_bits(&self) -> bool { - self.msg_ctl & MSI_CTL_64_BITS == MSI_CTL_64_BITS - } - - fn per_vector_mask(&self) -> bool { - self.msg_ctl & MSI_CTL_PER_VECTOR == MSI_CTL_PER_VECTOR - } - - fn enabled(&self) -> bool { - self.msg_ctl & MSI_CTL_ENABLE == MSI_CTL_ENABLE - } - - fn num_enabled_vectors(&self) -> usize { - msi_num_enabled_vectors(self.msg_ctl) - } - - fn vector_masked(&self, vector: usize) -> bool { - if !self.per_vector_mask() { - return false; - } - - (self.mask_bits >> vector) & 0x1 == 0x1 - } - - fn size(&self) -> u64 { - let mut size: u64 = 0xa; - - if self.addr_64_bits() { - size += 0x4; - } - if self.per_vector_mask() { - size += 0xa; - } - - size - } - - fn update(&mut self, offset: u64, data: &[u8]) { - // Calculate message data offset depending on the address being 32 or - // 64 bits. - // Calculate upper address offset if the address is 64 bits. - // Calculate mask bits offset based on the address being 32 or 64 bits - // and based on the per vector masking being enabled or not. 
- let (msg_data_offset, addr_hi_offset, mask_bits_offset): (u64, Option, Option) = - if self.addr_64_bits() { - let mask_bits = if self.per_vector_mask() { - Some(0x10) - } else { - None - }; - (0xc, Some(0x8), mask_bits) - } else { - let mask_bits = if self.per_vector_mask() { - Some(0xc) - } else { - None - }; - (0x8, None, mask_bits) - }; - - // Update cache without overriding the read-only bits. - match data.len() { - 2 => { - let value = LittleEndian::read_u16(data); - match offset { - MSI_MSG_CTL_OFFSET => { - self.msg_ctl = (self.msg_ctl & !(MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) - | (value & (MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) - } - x if x == msg_data_offset => self.msg_data = value, - _ => error!("invalid offset"), - } - } - 4 => { - let value = LittleEndian::read_u32(data); - match offset { - 0x0 => { - self.msg_ctl = (self.msg_ctl & !(MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) - | ((value >> 16) as u16 & (MSI_CTL_ENABLE | MSI_CTL_MULTI_MSG_ENABLE)) - } - MSI_MSG_ADDR_LO_OFFSET => self.msg_addr_lo = value & MSI_MSG_ADDR_LO_MASK, - x if x == msg_data_offset => self.msg_data = value as u16, - x if addr_hi_offset.is_some() && x == addr_hi_offset.unwrap() => { - self.msg_addr_hi = value - } - x if mask_bits_offset.is_some() && x == mask_bits_offset.unwrap() => { - self.mask_bits = value - } - _ => error!("invalid offset"), - } - } - _ => error!("invalid data length"), - } - } -} - -#[derive(Serialize, Deserialize)] -pub struct MsiConfigState { - cap: MsiCap, -} - -pub struct MsiConfig { - pub cap: MsiCap, - interrupt_source_group: Arc, -} - -impl MsiConfig { - pub fn new( - msg_ctl: u16, - interrupt_source_group: Arc, - state: Option, - ) -> Result { - let cap = if let Some(state) = state { - if state.cap.enabled() { - for idx in 0..state.cap.num_enabled_vectors() { - let config = MsiIrqSourceConfig { - high_addr: state.cap.msg_addr_hi, - low_addr: state.cap.msg_addr_lo, - data: state.cap.msg_data as u32, - devid: 0, - }; - - 
interrupt_source_group - .update( - idx as InterruptIndex, - InterruptSourceConfig::MsiIrq(config), - state.cap.vector_masked(idx), - false, - ) - .map_err(Error::UpdateInterruptRoute)?; - } - - interrupt_source_group - .set_gsi() - .map_err(Error::EnableInterruptRoute)?; - - interrupt_source_group - .enable() - .map_err(Error::EnableInterruptRoute)?; - } - - state.cap - } else { - MsiCap { - msg_ctl, - ..Default::default() - } - }; - - Ok(MsiConfig { - cap, - interrupt_source_group, - }) - } - - pub fn state(&self) -> MsiConfigState { - MsiConfigState { cap: self.cap } - } - - pub fn enabled(&self) -> bool { - self.cap.enabled() - } - - pub fn size(&self) -> u64 { - self.cap.size() - } - - pub fn num_enabled_vectors(&self) -> usize { - self.cap.num_enabled_vectors() - } - - pub fn update(&mut self, offset: u64, data: &[u8]) { - let old_enabled = self.cap.enabled(); - - self.cap.update(offset, data); - - if self.cap.enabled() { - for idx in 0..self.num_enabled_vectors() { - let config = MsiIrqSourceConfig { - high_addr: self.cap.msg_addr_hi, - low_addr: self.cap.msg_addr_lo, - data: self.cap.msg_data as u32, - devid: 0, - }; - - if let Err(e) = self.interrupt_source_group.update( - idx as InterruptIndex, - InterruptSourceConfig::MsiIrq(config), - self.cap.vector_masked(idx), - true, - ) { - error!("Failed updating vector: {:?}", e); - } - } - - if !old_enabled { - if let Err(e) = self.interrupt_source_group.enable() { - error!("Failed enabling irq_fd: {:?}", e); - } - } - } else if old_enabled { - if let Err(e) = self.interrupt_source_group.disable() { - error!("Failed disabling irq_fd: {:?}", e); - } - } - } -} diff --git a/src/vm-device/src/dma_mapping/mod.rs b/src/vm-device/src/dma_mapping/mod.rs deleted file mode 100644 index 6cba6e16488..00000000000 --- a/src/vm-device/src/dma_mapping/mod.rs +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// Copyright © 2021 Intel Corporation -// -// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause - -/// Trait to trigger DMA mapping updates for devices managed by virtio-iommu -/// -/// Trait meant for triggering the DMA mapping update related to an external -/// device not managed fully through virtio. It is dedicated to virtio-iommu -/// in order to trigger the map update anytime the mapping is updated from the -/// guest. -pub trait ExternalDmaMapping: Send + Sync { - /// Map a memory range - fn map(&self, iova: u64, gpa: u64, size: u64) -> std::result::Result<(), std::io::Error>; - - /// Unmap a memory range - fn unmap(&self, iova: u64, size: u64) -> std::result::Result<(), std::io::Error>; -} diff --git a/src/vm-device/src/lib.rs b/src/vm-device/src/lib.rs index fe06fd8b465..b980b09c4b9 100644 --- a/src/vm-device/src/lib.rs +++ b/src/vm-device/src/lib.rs @@ -9,7 +9,6 @@ use serde::{Deserialize, Serialize}; mod bus; -pub mod dma_mapping; pub mod interrupt; pub use self::bus::{Bus, BusDevice, BusDeviceSync, Error as BusError}; From 1188b7996b5f482dd73bcaca4a1658892c8f1533 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 2 Jun 2025 18:46:32 +0200 Subject: [PATCH 35/56] refactor: allow storing Arc<Vm> within Vmm We'd like to be able to store Vm within an atomic reference so we can pass it around and share it with other components. The main issue with doing this change is that we need Vm to be `mut` during initialization and the builder.rs code was creating Vmm with Vm embedded in it. To solve this, we break down the initialization of the Vmm object. We first create its individual parts (Vm, Kvm and DeviceManager), perform any necessary initialization logic on Vm and once this is done, add it within an Arc.
Signed-off-by: Babis Chalios --- src/vmm/src/arch/aarch64/mod.rs | 23 +- src/vmm/src/arch/x86_64/mod.rs | 26 ++- src/vmm/src/builder.rs | 309 +++++++++++++++----------- src/vmm/src/device_manager/acpi.rs | 4 +- src/vmm/src/device_manager/mmio.rs | 13 +- src/vmm/src/device_manager/mod.rs | 12 + src/vmm/src/device_manager/persist.rs | 1 + src/vmm/src/lib.rs | 3 +- 8 files changed, 230 insertions(+), 161 deletions(-) diff --git a/src/vmm/src/arch/aarch64/mod.rs b/src/vmm/src/arch/aarch64/mod.rs index df6e712dcf5..a599db5dea7 100644 --- a/src/vmm/src/arch/aarch64/mod.rs +++ b/src/vmm/src/arch/aarch64/mod.rs @@ -32,7 +32,7 @@ use crate::utils::{align_up, u64_to_usize, usize_to_u64}; use crate::vmm_config::machine_config::MachineConfig; use crate::vstate::memory::{Address, Bytes, GuestAddress, GuestMemory, GuestMemoryMmap}; use crate::vstate::vcpu::KvmVcpuError; -use crate::{Vcpu, VcpuConfig, Vmm, logger}; +use crate::{DeviceManager, Kvm, Vcpu, VcpuConfig, Vm, logger}; /// Errors thrown while configuring aarch64 system. #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -82,8 +82,11 @@ pub fn arch_memory_regions(size: usize) -> Vec<(GuestAddress, usize)> { } /// Configures the system for booting Linux. +#[allow(clippy::too_many_arguments)] pub fn configure_system_for_boot( - vmm: &mut Vmm, + kvm: &Kvm, + vm: &Vm, + device_manager: &mut DeviceManager, vcpus: &mut [Vcpu], machine_config: &MachineConfig, cpu_template: &CustomCpuTemplate, @@ -103,11 +106,11 @@ pub fn configure_system_for_boot( cpu_config, }; - let optional_capabilities = vmm.kvm.optional_capabilities(); + let optional_capabilities = kvm.optional_capabilities(); // Configure vCPUs with normalizing and setting the generated CPU configuration. 
for vcpu in vcpus.iter_mut() { vcpu.kvm_vcpu.configure( - vmm.vm.guest_memory(), + vm.guest_memory(), entry_point, &vcpu_config, &optional_capabilities, @@ -123,18 +126,16 @@ pub fn configure_system_for_boot( .expect("Cannot create cstring from cmdline string"); let fdt = fdt::create_fdt( - vmm.vm.guest_memory(), + vm.guest_memory(), vcpu_mpidr, cmdline, - &vmm.device_manager, - vmm.vm.get_irqchip(), + device_manager, + vm.get_irqchip(), initrd, )?; - let fdt_address = GuestAddress(get_fdt_addr(vmm.vm.guest_memory())); - vmm.vm - .guest_memory() - .write_slice(fdt.as_slice(), fdt_address)?; + let fdt_address = GuestAddress(get_fdt_addr(vm.guest_memory())); + vm.guest_memory().write_slice(fdt.as_slice(), fdt_address)?; Ok(()) } diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index fe1296e5d1c..68b903d5ff6 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -33,6 +33,7 @@ pub mod generated; use std::fs::File; +use kvm::Kvm; use layout::{ CMDLINE_START, FIRST_ADDR_PAST_32BITS, FIRST_ADDR_PAST_64BITS_MMIO, MMIO32_MEM_SIZE, MMIO32_MEM_START, MMIO64_MEM_SIZE, MMIO64_MEM_START, PCI_MMCONFIG_SIZE, PCI_MMCONFIG_START, @@ -53,6 +54,7 @@ use crate::acpi::create_acpi_tables; use crate::arch::{BootProtocol, SYSTEM_MEM_SIZE, SYSTEM_MEM_START, arch_memory_regions_with_gap}; use crate::cpu_config::templates::{CustomCpuTemplate, GuestConfigError}; use crate::cpu_config::x86_64::CpuConfiguration; +use crate::device_manager::DeviceManager; use crate::initrd::InitrdConfig; use crate::utils::{align_down, u64_to_usize, usize_to_u64}; use crate::vmm_config::machine_config::MachineConfig; @@ -60,7 +62,7 @@ use crate::vstate::memory::{ Address, GuestAddress, GuestMemory, GuestMemoryMmap, GuestMemoryRegion, }; use crate::vstate::vcpu::KvmVcpuConfigureError; -use crate::{Vcpu, VcpuConfig, Vmm, logger}; +use crate::{Vcpu, VcpuConfig, Vm, logger}; // Value taken from 
https://elixir.bootlin.com/linux/v5.10.68/source/arch/x86/include/uapi/asm/e820.h#L31 // Usable normal RAM @@ -169,8 +171,11 @@ pub fn initrd_load_addr(guest_mem: &GuestMemoryMmap, initrd_size: usize) -> Opti } /// Configures the system for booting Linux. +#[allow(clippy::too_many_arguments)] pub fn configure_system_for_boot( - vmm: &mut Vmm, + kvm: &Kvm, + vm: &Vm, + device_manager: &mut DeviceManager, vcpus: &mut [Vcpu], machine_config: &MachineConfig, cpu_template: &CustomCpuTemplate, @@ -179,8 +184,7 @@ pub fn configure_system_for_boot( boot_cmdline: Cmdline, ) -> Result<(), ConfigurationError> { // Construct the base CpuConfiguration to apply CPU template onto. - let cpu_config = - CpuConfiguration::new(vmm.kvm.supported_cpuid.clone(), cpu_template, &vcpus[0])?; + let cpu_config = CpuConfiguration::new(kvm.supported_cpuid.clone(), cpu_template, &vcpus[0])?; // Apply CPU template to the base CpuConfiguration. let cpu_config = CpuConfiguration::apply_template(cpu_config, cpu_template)?; @@ -193,7 +197,7 @@ pub fn configure_system_for_boot( // Configure vCPUs with normalizing and setting the generated CPU configuration. for vcpu in vcpus.iter_mut() { vcpu.kvm_vcpu - .configure(vmm.vm.guest_memory(), entry_point, &vcpu_config)?; + .configure(vm.guest_memory(), entry_point, &vcpu_config)?; } // Write the kernel command line to guest memory. 
This is x86_64 specific, since on @@ -204,7 +208,7 @@ pub fn configure_system_for_boot( .expect("Cannot create cstring from cmdline string"); load_cmdline( - vmm.vm.guest_memory(), + vm.guest_memory(), GuestAddress(crate::arch::x86_64::layout::CMDLINE_START), &boot_cmdline, ) @@ -212,19 +216,19 @@ pub fn configure_system_for_boot( // Note that this puts the mptable at the last 1k of Linux's 640k base RAM mptable::setup_mptable( - vmm.vm.guest_memory(), - &vmm.device_manager.resource_allocator, + vm.guest_memory(), + &device_manager.resource_allocator, vcpu_config.vcpu_count, ) .map_err(ConfigurationError::MpTableSetup)?; match entry_point.protocol { BootProtocol::PvhBoot => { - configure_pvh(vmm.vm.guest_memory(), GuestAddress(CMDLINE_START), initrd)?; + configure_pvh(vm.guest_memory(), GuestAddress(CMDLINE_START), initrd)?; } BootProtocol::LinuxBoot => { configure_64bit_boot( - vmm.vm.guest_memory(), + vm.guest_memory(), GuestAddress(CMDLINE_START), cmdline_size, initrd, @@ -234,7 +238,7 @@ pub fn configure_system_for_boot( // Create ACPI tables and write them in guest memory // For the time being we only support ACPI in x86_64 - create_acpi_tables(vmm.vm.guest_memory(), &mut vmm.device_manager, vcpus)?; + create_acpi_tables(vm.guest_memory(), device_manager, vcpus)?; Ok(()) } diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 2c037fc529f..290cf000c5e 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -16,17 +16,18 @@ use utils::time::TimestampUs; #[cfg(target_arch = "aarch64")] use vm_memory::GuestAddress; +#[cfg(target_arch = "aarch64")] +use crate::Vcpu; use crate::arch::{ConfigurationError, configure_system_for_boot, load_kernel}; #[cfg(target_arch = "aarch64")] use crate::construct_kvm_mpidrs; -use crate::cpu_config::templates::{ - GetCpuTemplate, GetCpuTemplateError, GuestConfigError, KvmCapability, -}; +use crate::cpu_config::templates::{GetCpuTemplate, GetCpuTemplateError, GuestConfigError}; #[cfg(target_arch = "aarch64")] 
use crate::device_manager::AttachLegacyMmioDeviceError; use crate::device_manager::pci_mngr::PciManagerError; use crate::device_manager::{ - AttachMmioDeviceError, AttachVmgenidError, DeviceManager, DevicePersistError, DeviceRestoreArgs, + AttachMmioDeviceError, AttachVmgenidError, DeviceManager, DeviceManagerCreateError, + DevicePersistError, DeviceRestoreArgs, }; use crate::devices::acpi::vmgenid::VmGenIdError; use crate::devices::virtio::balloon::Balloon; @@ -43,10 +44,10 @@ use crate::resources::VmResources; use crate::seccomp::BpfThreadMap; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::MachineConfigError; -use crate::vstate::kvm::Kvm; +use crate::vstate::kvm::{Kvm, KvmError}; use crate::vstate::memory::GuestRegionMmap; -use crate::vstate::vcpu::{Vcpu, VcpuError}; -use crate::vstate::vm::Vm; +use crate::vstate::vcpu::VcpuError; +use crate::vstate::vm::{Vm, VmError}; use crate::{EventManager, Vmm, VmmError, device_manager}; /// Errors associated with starting the instance. @@ -61,6 +62,8 @@ pub enum StartMicrovmError { AttachLegacyDevices(#[from] AttachLegacyMmioDeviceError), /// System configuration error: {0} ConfigureSystem(#[from] ConfigurationError), + /// Failed to create device manager: {0} + CreateDeviceManager(#[from] DeviceManagerCreateError), /// Failed to create guest config: {0} CreateGuestConfig(#[from] GuestConfigError), /// Cannot create network device: {0} @@ -87,6 +90,8 @@ pub enum StartMicrovmError { GetCpuTemplate(#[from] GetCpuTemplateError), /// Invalid kernel command line: {0} KernelCmdline(String), + /// Kvm error: {0} + Kvm(#[from] KvmError), /// Cannot load command line string: {0} LoadCommandline(linux_loader::loader::Error), /// Cannot start microvm without kernel configuration. 
@@ -115,6 +120,8 @@ pub enum StartMicrovmError { /// Error cloning Vcpu fds #[cfg(feature = "gdb")] VcpuFdCloneError(#[from] crate::vstate::vcpu::CopyKvmFdError), + /// Error with the Vm object: {0} + Vm(#[from] VmError), } /// It's convenient to automatically convert `linux_loader::cmdline::Error`s @@ -125,37 +132,6 @@ impl std::convert::From for StartMicrovmError { } } -#[cfg_attr(target_arch = "aarch64", allow(unused))] -fn create_vmm_and_vcpus( - instance_info: &InstanceInfo, - event_manager: &mut EventManager, - vcpu_count: u8, - kvm_capabilities: Vec, -) -> Result<(Vmm, Vec), VmmError> { - let kvm = Kvm::new(kvm_capabilities)?; - // Set up Kvm Vm and register memory regions. - // Build custom CPU config if a custom template is provided. - let mut vm = Vm::new(&kvm)?; - - let (vcpus, vcpus_exit_evt) = vm.create_vcpus(vcpu_count)?; - - let device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, vm.fd())?; - - let vmm = Vmm { - events_observer: Some(std::io::stdin()), - instance_info: instance_info.clone(), - shutdown_exit_code: None, - kvm, - vm, - uffd: None, - vcpus_handles: Vec::new(), - vcpus_exit_evt, - device_manager, - }; - - Ok((vmm, vcpus)) -} - /// Builds and starts a microVM based on the current Firecracker VmResources configuration. /// /// The built microVM and all the created vCPUs start off in the paused state. @@ -167,8 +143,6 @@ pub fn build_microvm_for_boot( event_manager: &mut EventManager, seccomp_filters: &BpfThreadMap, ) -> Result>, StartMicrovmError> { - use self::StartMicrovmError::*; - // Timestamp for measuring microVM boot duration. 
let request_ts = TimestampUs::default(); @@ -176,7 +150,7 @@ pub fn build_microvm_for_boot( .boot_source .builder .as_ref() - .ok_or(MissingKernelConfig)?; + .ok_or(StartMicrovmError::MissingKernelConfig)?; let guest_memory = vm_resources .allocate_guest_memory() @@ -191,19 +165,17 @@ pub fn build_microvm_for_boot( .cpu_template .get_cpu_template()?; - let (mut vmm, mut vcpus) = create_vmm_and_vcpus( - instance_info, - event_manager, - vm_resources.machine_config.vcpu_count, - cpu_template.kvm_capabilities.clone(), - )?; + let kvm = Kvm::new(cpu_template.kvm_capabilities.clone())?; + // Set up Kvm Vm and register memory regions. + // Build custom CPU config if a custom template is provided. + let mut vm = Vm::new(&kvm)?; + let (mut vcpus, vcpus_exit_evt) = vm.create_vcpus(vm_resources.machine_config.vcpu_count)?; + vm.register_memory_regions(guest_memory)?; - vmm.vm - .register_memory_regions(guest_memory) - .map_err(VmmError::Vm)?; + let mut device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, vm.fd())?; - let entry_point = load_kernel(&boot_config.kernel_file, vmm.vm.guest_memory())?; - let initrd = InitrdConfig::from_config(boot_config, vmm.vm.guest_memory())?; + let entry_point = load_kernel(&boot_config.kernel_file, vm.guest_memory())?; + let initrd = InitrdConfig::from_config(boot_config, vm.guest_memory())?; #[cfg(feature = "gdb")] let (gdb_tx, gdb_rx) = mpsc::channel(); @@ -214,11 +186,11 @@ pub fn build_microvm_for_boot( #[cfg(feature = "gdb")] let vcpu_fds = vcpus .iter() - .map(|vcpu| vcpu.copy_kvm_vcpu_fd(vmm.vm())) + .map(|vcpu| vcpu.copy_kvm_vcpu_fd(&vm)) .collect::, _>>()?; if vm_resources.pci_enabled { - vmm.device_manager.enable_pci()?; + device_manager.enable_pci()?; } else { boot_cmdline.insert("pci", "off")?; } @@ -227,53 +199,70 @@ pub fn build_microvm_for_boot( // to maintain the same MMIO address referenced in the documentation // and tests. 
if vm_resources.boot_timer { - vmm.device_manager.attach_boot_timer_device(request_ts)?; + device_manager.attach_boot_timer_device(request_ts)?; } if let Some(balloon) = vm_resources.balloon.get() { - attach_balloon_device(&mut vmm, &mut boot_cmdline, balloon, event_manager)?; + attach_balloon_device( + &mut device_manager, + &vm, + &mut boot_cmdline, + balloon, + event_manager, + )?; } attach_block_devices( - &mut vmm, + &mut device_manager, + &vm, &mut boot_cmdline, vm_resources.block.devices.iter(), event_manager, )?; attach_net_devices( - &mut vmm, + &mut device_manager, + &vm, &mut boot_cmdline, vm_resources.net_builder.iter(), event_manager, )?; if let Some(unix_vsock) = vm_resources.vsock.get() { - attach_unixsock_vsock_device(&mut vmm, &mut boot_cmdline, unix_vsock, event_manager)?; + attach_unixsock_vsock_device( + &mut device_manager, + &vm, + &mut boot_cmdline, + unix_vsock, + event_manager, + )?; } if let Some(entropy) = vm_resources.entropy.get() { - attach_entropy_device(&mut vmm, &mut boot_cmdline, entropy, event_manager)?; + attach_entropy_device( + &mut device_manager, + &vm, + &mut boot_cmdline, + entropy, + event_manager, + )?; } #[cfg(target_arch = "aarch64")] - vmm.device_manager.attach_legacy_devices_aarch64( - vmm.vm.fd(), - event_manager, - &mut boot_cmdline, - )?; + device_manager.attach_legacy_devices_aarch64(vm.fd(), event_manager, &mut boot_cmdline)?; - vmm.device_manager - .attach_vmgenid_device(vmm.vm.guest_memory(), vmm.vm.fd())?; + device_manager.attach_vmgenid_device(vm.guest_memory(), vm.fd())?; #[cfg(target_arch = "aarch64")] if vcpus[0].kvm_vcpu.supports_pvtime() { - setup_pvtime(&mut vmm, &mut vcpus)?; + setup_pvtime(&mut device_manager, &mut vcpus)?; } else { log::warn!("Vcpus do not support pvtime, steal time will not be reported to guest"); } configure_system_for_boot( - &mut vmm, + &kvm, + &vm, + &mut device_manager, vcpus.as_mut(), &vm_resources.machine_config, &cpu_template, @@ -282,6 +271,18 @@ pub fn 
build_microvm_for_boot( boot_cmdline, )?; + let vmm = Vmm { + events_observer: Some(std::io::stdin()), + instance_info: instance_info.clone(), + shutdown_exit_code: None, + kvm, + vm: Arc::new(vm), + uffd: None, + vcpus_handles: Vec::new(), + vcpus_exit_evt, + device_manager, + }; + let vmm = Arc::new(Mutex::new(vmm)); #[cfg(feature = "gdb")] @@ -293,7 +294,7 @@ pub fn build_microvm_for_boot( entry_point.entry_addr, gdb_socket_path, ) - .map_err(GdbServer)?; + .map_err(StartMicrovmError::GdbServer)?; } else { debug!("No GDB socket provided not starting gdb server."); } @@ -305,7 +306,7 @@ pub fn build_microvm_for_boot( vcpus, seccomp_filters .get("vcpu") - .ok_or_else(|| MissingSeccompFilters("vcpu".to_string()))? + .ok_or_else(|| StartMicrovmError::MissingSeccompFilters("vcpu".to_string()))? .clone(), ) .map_err(VmmError::VcpuStart)?; @@ -317,7 +318,7 @@ pub fn build_microvm_for_boot( crate::seccomp::apply_filter( seccomp_filters .get("vmm") - .ok_or_else(|| MissingSeccompFilters("vmm".to_string()))?, + .ok_or_else(|| StartMicrovmError::MissingSeccompFilters("vmm".to_string()))?, ) .map_err(VmmError::SeccompFilters)?; @@ -402,19 +403,21 @@ pub fn build_microvm_from_snapshot( ) -> Result>, BuildMicrovmFromSnapshotError> { // Build Vmm. debug!("event_start: build microvm from snapshot"); - let (mut vmm, mut vcpus) = create_vmm_and_vcpus( - instance_info, - event_manager, - vm_resources.machine_config.vcpu_count, - microvm_state.kvm_state.kvm_cap_modifiers.clone(), - ) - .map_err(StartMicrovmError::Internal)?; - vmm.vm - .register_memory_regions(guest_memory) - .map_err(VmmError::Vm) - .map_err(StartMicrovmError::Internal)?; - vmm.uffd = uffd; + let kvm = Kvm::new(microvm_state.kvm_state.kvm_cap_modifiers.clone()) + .map_err(StartMicrovmError::Kvm)?; + // Set up Kvm Vm and register memory regions. + // Build custom CPU config if a custom template is provided. 
+ let mut vm = Vm::new(&kvm).map_err(StartMicrovmError::Vm)?; + + let (mut vcpus, vcpus_exit_evt) = vm + .create_vcpus(vm_resources.machine_config.vcpu_count) + .map_err(StartMicrovmError::Vm)?; + + let mut device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, vm.fd()).unwrap(); + + vm.register_memory_regions(guest_memory) + .map_err(StartMicrovmError::Vm)?; #[cfg(target_arch = "x86_64")] { @@ -434,7 +437,7 @@ pub fn build_microvm_from_snapshot( #[cfg(target_arch = "aarch64")] if let Some(pvtime_ipa) = vcpus[0].kvm_vcpu.pvtime_ipa { allocate_pvtime_region( - &mut vmm, + &mut device_manager, vcpus.len(), vm_allocator::AllocPolicy::ExactMatch(pvtime_ipa.0), )?; @@ -452,28 +455,39 @@ pub fn build_microvm_from_snapshot( { let mpidrs = construct_kvm_mpidrs(&microvm_state.vcpu_states); // Restore kvm vm state. - vmm.vm.restore_state(&mpidrs, &microvm_state.vm_state)?; + vm.restore_state(&mpidrs, &microvm_state.vm_state)?; } // Restore kvm vm state. #[cfg(target_arch = "x86_64")] - vmm.vm.restore_state(&microvm_state.vm_state)?; + vm.restore_state(&microvm_state.vm_state)?; // Restore the boot source config paths. vm_resources.boot_source.config = microvm_state.vm_info.boot_source; // Restore devices states. let device_ctor_args = DeviceRestoreArgs { - mem: vmm.vm.guest_memory(), - vm: vmm.vm.fd(), + mem: vm.guest_memory(), + vm: vm.fd(), event_manager, vm_resources, instance_id: &instance_info.id, - restored_from_file: vmm.uffd.is_none(), + restored_from_file: uffd.is_none(), }; - vmm.device_manager - .restore(&microvm_state.device_states, device_ctor_args)?; + device_manager.restore(&microvm_state.device_states, device_ctor_args)?; + + let mut vmm = Vmm { + events_observer: Some(std::io::stdin()), + instance_info: instance_info.clone(), + shutdown_exit_code: None, + kvm, + vm: Arc::new(vm), + uffd, + vcpus_handles: Vec::new(), + vcpus_exit_evt, + device_manager, + }; // Move vcpus to their own threads and start their state machine in the 'Paused' state.
vmm.start_vcpus( @@ -506,13 +520,12 @@ const STEALTIME_STRUCT_MEM_SIZE: u64 = 64; /// Helper method to allocate steal time region #[cfg(target_arch = "aarch64")] fn allocate_pvtime_region( - vmm: &mut Vmm, + device_manager: &mut DeviceManager, vcpu_count: usize, policy: vm_allocator::AllocPolicy, ) -> Result { let size = STEALTIME_STRUCT_MEM_SIZE * vcpu_count as u64; - let addr = vmm - .device_manager + let addr = device_manager .resource_allocator .allocate_system_memory(size, STEALTIME_STRUCT_MEM_SIZE, policy) .map_err(StartMicrovmError::AllocateResources)?; @@ -521,10 +534,16 @@ fn allocate_pvtime_region( /// Sets up pvtime for all vcpus #[cfg(target_arch = "aarch64")] -fn setup_pvtime(vmm: &mut Vmm, vcpus: &mut [Vcpu]) -> Result<(), StartMicrovmError> { +fn setup_pvtime( + device_manager: &mut DeviceManager, + vcpus: &mut [Vcpu], +) -> Result<(), StartMicrovmError> { // Alloc sys mem for steal time region - let pvtime_mem: GuestAddress = - allocate_pvtime_region(vmm, vcpus.len(), vm_allocator::AllocPolicy::LastMatch)?; + let pvtime_mem: GuestAddress = allocate_pvtime_region( + device_manager, + vcpus.len(), + vm_allocator::AllocPolicy::LastMatch, + )?; // Register all vcpus with pvtime device for (i, vcpu) in vcpus.iter_mut().enumerate() { @@ -539,7 +558,8 @@ fn setup_pvtime(vmm: &mut Vmm, vcpus: &mut [Vcpu]) -> Result<(), StartMicrovmErr } fn attach_entropy_device( - vmm: &mut Vmm, + device_manager: &mut DeviceManager, + vm: &Vm, cmdline: &mut LoaderKernelCmdline, entropy_device: &Arc>, event_manager: &mut EventManager, @@ -551,9 +571,9 @@ fn attach_entropy_device( .to_string(); event_manager.add_subscriber(entropy_device.clone()); - vmm.device_manager.attach_virtio_device( - vmm.vm.guest_memory(), - vmm.vm.fd(), + device_manager.attach_virtio_device( + vm.guest_memory(), + vm.fd(), id, entropy_device.clone(), cmdline, @@ -562,7 +582,8 @@ fn attach_entropy_device( } fn attach_block_devices<'a, I: Iterator>> + Debug>( - vmm: &mut Vmm, + device_manager: &mut 
DeviceManager, + vm: &Vm, cmdline: &mut LoaderKernelCmdline, blocks: I, event_manager: &mut EventManager, @@ -584,9 +605,9 @@ fn attach_block_devices<'a, I: Iterator>> + Debug>( }; // The device mutex mustn't be locked here otherwise it will deadlock. event_manager.add_subscriber(block.clone()); - vmm.device_manager.attach_virtio_device( - vmm.vm.guest_memory(), - vmm.vm.fd(), + device_manager.attach_virtio_device( + vm.guest_memory(), + vm.fd(), id, block.clone(), cmdline, @@ -597,7 +618,8 @@ fn attach_block_devices<'a, I: Iterator>> + Debug>( } fn attach_net_devices<'a, I: Iterator>> + Debug>( - vmm: &mut Vmm, + device_manager: &mut DeviceManager, + vm: &Vm, cmdline: &mut LoaderKernelCmdline, net_devices: I, event_manager: &mut EventManager, @@ -606,9 +628,9 @@ fn attach_net_devices<'a, I: Iterator>> + Debug>( let id = net_device.lock().expect("Poisoned lock").id().clone(); event_manager.add_subscriber(net_device.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. - vmm.device_manager.attach_virtio_device( - vmm.vm.guest_memory(), - vmm.vm.fd(), + device_manager.attach_virtio_device( + vm.guest_memory(), + vm.fd(), id, net_device.clone(), cmdline, @@ -619,7 +641,8 @@ fn attach_net_devices<'a, I: Iterator>> + Debug>( } fn attach_unixsock_vsock_device( - vmm: &mut Vmm, + device_manager: &mut DeviceManager, + vm: &Vm, cmdline: &mut LoaderKernelCmdline, unix_vsock: &Arc>>, event_manager: &mut EventManager, @@ -627,9 +650,9 @@ fn attach_unixsock_vsock_device( let id = String::from(unix_vsock.lock().expect("Poisoned lock").id()); event_manager.add_subscriber(unix_vsock.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. 
- vmm.device_manager.attach_virtio_device( - vmm.vm.guest_memory(), - vmm.vm.fd(), + device_manager.attach_virtio_device( + vm.guest_memory(), + vm.fd(), id, unix_vsock.clone(), cmdline, @@ -638,7 +661,8 @@ fn attach_unixsock_vsock_device( } fn attach_balloon_device( - vmm: &mut Vmm, + device_manager: &mut DeviceManager, + vm: &Vm, cmdline: &mut LoaderKernelCmdline, balloon: &Arc>, event_manager: &mut EventManager, @@ -646,9 +670,9 @@ fn attach_balloon_device( let id = String::from(balloon.lock().expect("Poisoned lock").id()); event_manager.add_subscriber(balloon.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. - vmm.device_manager.attach_virtio_device( - vmm.vm.guest_memory(), - vmm.vm.fd(), + device_manager.attach_virtio_device( + vm.guest_memory(), + vm.fd(), id, balloon.clone(), cmdline, @@ -743,7 +767,7 @@ pub(crate) mod tests { instance_info: InstanceInfo::default(), shutdown_exit_code: None, kvm, - vm, + vm: Arc::new(vm), uffd: None, vcpus_handles: Vec::new(), vcpus_exit_evt, @@ -788,7 +812,8 @@ pub(crate) mod tests { } attach_block_devices( - vmm, + &mut vmm.device_manager, + &vmm.vm, cmdline, block_dev_configs.devices.iter(), event_manager, @@ -806,7 +831,13 @@ pub(crate) mod tests { let mut net_builder = NetBuilder::new(); net_builder.build(net_config).unwrap(); - let res = attach_net_devices(vmm, cmdline, net_builder.iter(), event_manager); + let res = attach_net_devices( + &mut vmm.device_manager, + &vmm.vm, + cmdline, + net_builder.iter(), + event_manager, + ); res.unwrap(); } @@ -827,7 +858,14 @@ pub(crate) mod tests { Arc::new(Mutex::new(mmds)), ); - attach_net_devices(vmm, cmdline, net_builder.iter(), event_manager).unwrap(); + attach_net_devices( + &mut vmm.device_manager, + &vmm.vm, + cmdline, + net_builder.iter(), + event_manager, + ) + .unwrap(); } pub(crate) fn insert_vsock_device( @@ -840,7 +878,14 @@ pub(crate) mod tests { let vsock = VsockBuilder::create_unixsock_vsock(vsock_config).unwrap(); let vsock = 
Arc::new(Mutex::new(vsock)); - attach_unixsock_vsock_device(vmm, cmdline, &vsock, event_manager).unwrap(); + attach_unixsock_vsock_device( + &mut vmm.device_manager, + &vmm.vm, + cmdline, + &vsock, + event_manager, + ) + .unwrap(); assert!( vmm.device_manager @@ -859,7 +904,14 @@ pub(crate) mod tests { let mut builder = EntropyDeviceBuilder::new(); let entropy = builder.build(entropy_config).unwrap(); - attach_entropy_device(vmm, cmdline, &entropy, event_manager).unwrap(); + attach_entropy_device( + &mut vmm.device_manager, + &vmm.vm, + cmdline, + &entropy, + event_manager, + ) + .unwrap(); assert!( vmm.device_manager @@ -887,7 +939,14 @@ pub(crate) mod tests { builder.set(balloon_config).unwrap(); let balloon = builder.get().unwrap(); - attach_balloon_device(vmm, cmdline, balloon, event_manager).unwrap(); + attach_balloon_device( + &mut vmm.device_manager, + &vmm.vm, + cmdline, + balloon, + event_manager, + ) + .unwrap(); assert!( vmm.device_manager diff --git a/src/vmm/src/device_manager/acpi.rs b/src/vmm/src/device_manager/acpi.rs index 78f1254d2fa..8a447c4c065 100644 --- a/src/vmm/src/device_manager/acpi.rs +++ b/src/vmm/src/device_manager/acpi.rs @@ -6,7 +6,7 @@ use kvm_ioctls::VmFd; use crate::devices::acpi::vmgenid::VmGenId; -#[derive(Debug)] +#[derive(Debug, Default)] pub struct ACPIDeviceManager { /// VMGenID device pub vmgenid: Option, @@ -15,7 +15,7 @@ pub struct ACPIDeviceManager { impl ACPIDeviceManager { /// Create a new ACPIDeviceManager object pub fn new() -> Self { - Self { vmgenid: None } + Default::default() } /// Attach a new VMGenID device to the microVM diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index deb07ad9f91..191576b59fd 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -128,7 +128,7 @@ pub struct MMIODevice { } /// Manages the complexities of registering a MMIO device. 
-#[derive(Debug)] +#[derive(Debug, Default)] pub struct MMIODeviceManager { /// VirtIO devices using an MMIO transport layer pub(crate) virtio_devices: HashMap<(u32, String), MMIODevice>, @@ -154,16 +154,7 @@ pub struct MMIODeviceManager { impl MMIODeviceManager { /// Create a new DeviceManager handling mmio devices (virtio net, block). pub fn new() -> MMIODeviceManager { - MMIODeviceManager { - virtio_devices: HashMap::new(), - boot_timer: None, - #[cfg(target_arch = "aarch64")] - rtc: None, - #[cfg(target_arch = "aarch64")] - serial: None, - #[cfg(target_arch = "x86_64")] - dsdt_data: vec![], - } + Default::default() } /// Allocates resources for a new device to be added. diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 2922060bb13..5457b22e39d 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -294,6 +294,18 @@ pub struct DeviceRestoreArgs<'a> { pub restored_from_file: bool, } +impl std::fmt::Debug for DeviceRestoreArgs<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("DeviceRestoreArgs") + .field("mem", &self.mem) + .field("vm", &self.vm) + .field("vm_resources", &self.vm_resources) + .field("instance_id", &self.instance_id) + .field("restored_from_file", &self.restored_from_file) + .finish() + } +} + impl DeviceManager { pub fn save(&self) -> DevicesState { DevicesState { diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index e3c7d2a8475..f267212ba2e 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -240,6 +240,7 @@ pub struct ACPIDeviceManagerState { vmgenid: Option, } +#[derive(Debug)] pub struct ACPIDeviceManagerConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, pub resource_allocator: &'a ResourceAllocator, diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 01ef9547d82..18177367ada 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs 
@@ -299,8 +299,9 @@ pub struct Vmm { // Guest VM core resources. kvm: Kvm, /// VM object - pub vm: Vm, + pub vm: Arc, // Save UFFD in order to keep it open in the Firecracker process, as well. + #[allow(unused)] uffd: Option, vcpus_handles: Vec, // Used by Vcpus and devices to initiate teardown; Vmm should never write here. From 6ea14631bd934c6e9e295a39e1a1dbe257caacf4 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 2 Jun 2025 19:15:23 +0200 Subject: [PATCH 36/56] vm: track device interrupts within Vm object Add logic to track the device interrupts used by the microVM. This is not strictly needed right now, but we will need it when adding support for MSI-X interrupts. MSI-X interrupts are configured at runtime and we need to interact with KVM to set the interrupt routes. To do it, we need to keep track of all the interrupts the VM is using. Signed-off-by: Babis Chalios --- src/vmm/src/arch/aarch64/fdt.rs | 27 ++++++----- src/vmm/src/builder.rs | 57 +++++------------------ src/vmm/src/device_manager/acpi.rs | 10 ++-- src/vmm/src/device_manager/legacy.rs | 15 +++--- src/vmm/src/device_manager/mmio.rs | 28 +++++------ src/vmm/src/device_manager/mod.rs | 29 ++++++------ src/vmm/src/device_manager/persist.rs | 9 ++-- src/vmm/src/vstate/vm.rs | 67 ++++++++++++++++++++++++++- 8 files changed, 131 insertions(+), 111 deletions(-) diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index 8e67a50bd64..a2a4992eb29 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -499,17 +499,16 @@ mod tests { use std::ffi::CString; use std::sync::{Arc, Mutex}; - use kvm_ioctls::Kvm; use linux_loader::cmdline as kernel_cmdline; use super::*; - use crate::EventManager; use crate::arch::aarch64::gic::create_gic; use crate::arch::aarch64::layout; use crate::device_manager::mmio::tests::DummyDevice; use crate::device_manager::tests::default_device_manager; use crate::test_utils::arch_mem; use
crate::vstate::memory::GuestAddress; + use crate::{EventManager, Kvm, Vm}; // The `load` function from the `device_tree` will mistakenly check the actual size // of the buffer with the allocated size. This works around that. @@ -525,9 +524,9 @@ mod tests { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); let mut event_manager = EventManager::new().unwrap(); let mut device_manager = default_device_manager(); - let kvm = Kvm::new().unwrap(); - let vm = kvm.create_vm().unwrap(); - let gic = create_gic(&vm, 1, None).unwrap(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = Vm::new(&kvm).unwrap(); + let gic = create_gic(vm.fd(), 1, None).unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); cmdline.insert("console", "/dev/tty0").unwrap(); @@ -562,9 +561,9 @@ mod tests { fn test_create_fdt_with_vmgenid() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); let mut device_manager = default_device_manager(); - let kvm = Kvm::new().unwrap(); - let vm = kvm.create_vm().unwrap(); - let gic = create_gic(&vm, 1, None).unwrap(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = Vm::new(&kvm).unwrap(); + let gic = create_gic(vm.fd(), 1, None).unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); cmdline.insert("console", "/dev/tty0").unwrap(); @@ -585,9 +584,9 @@ mod tests { fn test_create_fdt() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); let device_manager = default_device_manager(); - let kvm = Kvm::new().unwrap(); - let vm = kvm.create_vm().unwrap(); - let gic = create_gic(&vm, 1, None).unwrap(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = Vm::new(&kvm).unwrap(); + let gic = create_gic(vm.fd(), 1, None).unwrap(); let saved_dtb_bytes = match gic.fdt_compatibility() { "arm,gic-v3" => include_bytes!("output_GICv3.dtb"), @@ -642,9 +641,9 @@ mod tests { fn test_create_fdt_with_initrd() { let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); let device_manager = default_device_manager(); - let kvm = Kvm::new().unwrap(); - let vm 
= kvm.create_vm().unwrap(); - let gic = create_gic(&vm, 1, None).unwrap(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = Vm::new(&kvm).unwrap(); + let gic = create_gic(vm.fd(), 1, None).unwrap(); let saved_dtb_bytes = match gic.fdt_compatibility() { "arm,gic-v3" => include_bytes!("output_initrd_GICv3.dtb"), diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 290cf000c5e..b0712abc3a5 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -172,7 +172,7 @@ pub fn build_microvm_for_boot( let (mut vcpus, vcpus_exit_evt) = vm.create_vcpus(vm_resources.machine_config.vcpu_count)?; vm.register_memory_regions(guest_memory)?; - let mut device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, vm.fd())?; + let mut device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, &vm)?; let entry_point = load_kernel(&boot_config.kernel_file, vm.guest_memory())?; let initrd = InitrdConfig::from_config(boot_config, vm.guest_memory())?; @@ -248,9 +248,9 @@ pub fn build_microvm_for_boot( } #[cfg(target_arch = "aarch64")] - device_manager.attach_legacy_devices_aarch64(vm.fd(), event_manager, &mut boot_cmdline)?; + device_manager.attach_legacy_devices_aarch64(&vm, event_manager, &mut boot_cmdline)?; - device_manager.attach_vmgenid_device(vm.guest_memory(), vm.fd())?; + device_manager.attach_vmgenid_device(vm.guest_memory(), &vm)?; #[cfg(target_arch = "aarch64")] if vcpus[0].kvm_vcpu.supports_pvtime() { @@ -414,7 +414,7 @@ pub fn build_microvm_from_snapshot( .create_vcpus(vm_resources.machine_config.vcpu_count) .map_err(StartMicrovmError::Vm)?; - let mut device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, vm.fd()).unwrap(); + let mut device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, &vm).unwrap(); vm.register_memory_regions(guest_memory) .map_err(StartMicrovmError::Vm)?; @@ -468,7 +468,7 @@ pub fn build_microvm_from_snapshot( // Restore devices states. 
let device_ctor_args = DeviceRestoreArgs { mem: vm.guest_memory(), - vm: vm.fd(), + vm: &vm, event_manager, vm_resources, instance_id: &instance_info.id, @@ -571,14 +571,7 @@ fn attach_entropy_device( .to_string(); event_manager.add_subscriber(entropy_device.clone()); - device_manager.attach_virtio_device( - vm.guest_memory(), - vm.fd(), - id, - entropy_device.clone(), - cmdline, - false, - ) + device_manager.attach_virtio_device(vm, id, entropy_device.clone(), cmdline, false) } fn attach_block_devices<'a, I: Iterator>> + Debug>( @@ -605,14 +598,7 @@ fn attach_block_devices<'a, I: Iterator>> + Debug>( }; // The device mutex mustn't be locked here otherwise it will deadlock. event_manager.add_subscriber(block.clone()); - device_manager.attach_virtio_device( - vm.guest_memory(), - vm.fd(), - id, - block.clone(), - cmdline, - is_vhost_user, - )?; + device_manager.attach_virtio_device(vm, id, block.clone(), cmdline, is_vhost_user)?; } Ok(()) } @@ -628,14 +614,7 @@ fn attach_net_devices<'a, I: Iterator>> + Debug>( let id = net_device.lock().expect("Poisoned lock").id().clone(); event_manager.add_subscriber(net_device.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. - device_manager.attach_virtio_device( - vm.guest_memory(), - vm.fd(), - id, - net_device.clone(), - cmdline, - false, - )?; + device_manager.attach_virtio_device(vm, id, net_device.clone(), cmdline, false)?; } Ok(()) } @@ -650,14 +629,7 @@ fn attach_unixsock_vsock_device( let id = String::from(unix_vsock.lock().expect("Poisoned lock").id()); event_manager.add_subscriber(unix_vsock.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. 
- device_manager.attach_virtio_device( - vm.guest_memory(), - vm.fd(), - id, - unix_vsock.clone(), - cmdline, - false, - ) + device_manager.attach_virtio_device(vm, id, unix_vsock.clone(), cmdline, false) } fn attach_balloon_device( @@ -670,14 +642,7 @@ fn attach_balloon_device( let id = String::from(balloon.lock().expect("Poisoned lock").id()); event_manager.add_subscriber(balloon.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. - device_manager.attach_virtio_device( - vm.guest_memory(), - vm.fd(), - id, - balloon.clone(), - cmdline, - false, - ) + device_manager.attach_virtio_device(vm, id, balloon.clone(), cmdline, false) } #[cfg(test)] @@ -924,7 +889,7 @@ pub(crate) mod tests { #[cfg(target_arch = "x86_64")] pub(crate) fn insert_vmgenid_device(vmm: &mut Vmm) { vmm.device_manager - .attach_vmgenid_device(vmm.vm.guest_memory(), vmm.vm.fd()) + .attach_vmgenid_device(vmm.vm.guest_memory(), &vmm.vm) .unwrap(); assert!(vmm.device_manager.acpi_devices.vmgenid.is_some()); } diff --git a/src/vmm/src/device_manager/acpi.rs b/src/vmm/src/device_manager/acpi.rs index 8a447c4c065..3f0af80c7aa 100644 --- a/src/vmm/src/device_manager/acpi.rs +++ b/src/vmm/src/device_manager/acpi.rs @@ -2,8 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 use acpi_tables::{Aml, aml}; -use kvm_ioctls::VmFd; +use crate::Vm; use crate::devices::acpi::vmgenid::VmGenId; #[derive(Debug, Default)] @@ -21,12 +21,8 @@ impl ACPIDeviceManager { /// Attach a new VMGenID device to the microVM /// /// This will register the device's interrupt with KVM - pub fn attach_vmgenid( - &mut self, - vmgenid: VmGenId, - vm_fd: &VmFd, - ) -> Result<(), kvm_ioctls::Error> { - vm_fd.register_irqfd(&vmgenid.interrupt_evt, vmgenid.gsi)?; + pub fn attach_vmgenid(&mut self, vmgenid: VmGenId, vm: &Vm) -> Result<(), kvm_ioctls::Error> { + vm.register_irq(&vmgenid.interrupt_evt, vmgenid.gsi)?; self.vmgenid = Some(vmgenid); Ok(()) } diff --git a/src/vmm/src/device_manager/legacy.rs 
b/src/vmm/src/device_manager/legacy.rs index cedb7abc32c..7011ae71122 100644 --- a/src/vmm/src/device_manager/legacy.rs +++ b/src/vmm/src/device_manager/legacy.rs @@ -11,11 +11,11 @@ use std::sync::{Arc, Mutex}; use acpi_tables::aml::AmlError; use acpi_tables::{Aml, aml}; -use kvm_ioctls::VmFd; use libc::EFD_NONBLOCK; use vm_superio::Serial; use vmm_sys_util::eventfd::EventFd; +use crate::Vm; use crate::devices::legacy::serial::SerialOut; use crate::devices::legacy::{EventFdTrigger, I8042Device, SerialDevice, SerialEventsWrapper}; @@ -100,7 +100,7 @@ impl PortIODeviceManager { pub fn register_devices( &mut self, io_bus: &vm_device::Bus, - vm_fd: &VmFd, + vm: &Vm, ) -> Result<(), LegacyDeviceError> { let serial_2_4 = Arc::new(Mutex::new(SerialDevice { serial: Serial::with_events( @@ -148,18 +148,15 @@ impl PortIODeviceManager { Self::I8042_KDB_DATA_REGISTER_SIZE, )?; - vm_fd - .register_irqfd(&self.com_evt_1_3, Self::COM_EVT_1_3_GSI) + vm.register_irq(&self.com_evt_1_3, Self::COM_EVT_1_3_GSI) .map_err(|e| { LegacyDeviceError::EventFd(std::io::Error::from_raw_os_error(e.errno())) })?; - vm_fd - .register_irqfd(&self.com_evt_2_4, Self::COM_EVT_2_4_GSI) + vm.register_irq(&self.com_evt_2_4, Self::COM_EVT_2_4_GSI) .map_err(|e| { LegacyDeviceError::EventFd(std::io::Error::from_raw_os_error(e.errno())) })?; - vm_fd - .register_irqfd(&self.kbd_evt, Self::KBD_EVT_GSI) + vm.register_irq(&self.kbd_evt, Self::KBD_EVT_GSI) .map_err(|e| { LegacyDeviceError::EventFd(std::io::Error::from_raw_os_error(e.errno())) })?; @@ -264,6 +261,6 @@ mod tests { )), ) .unwrap(); - ldm.register_devices(&io_bus, vm.fd()).unwrap(); + ldm.register_devices(&io_bus, &vm).unwrap(); } } diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 191576b59fd..5031e3104ba 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -12,7 +12,7 @@ use std::sync::{Arc, Mutex}; #[cfg(target_arch = "x86_64")] use acpi_tables::{Aml, aml}; -use 
kvm_ioctls::{IoEventAddress, VmFd}; +use kvm_ioctls::IoEventAddress; use linux_loader::cmdline as kernel_cmdline; #[cfg(target_arch = "x86_64")] use log::debug; @@ -21,6 +21,7 @@ use serde::{Deserialize, Serialize}; use vm_allocator::AllocPolicy; use super::resources::ResourceAllocator; +use crate::Vm; use crate::arch::BOOT_DEVICE_MEM_START; #[cfg(target_arch = "aarch64")] use crate::arch::{RTC_MEM_START, SERIAL_MEM_START}; @@ -184,7 +185,7 @@ impl MMIODeviceManager { /// Register a virtio-over-MMIO device to be used via MMIO transport at a specific slot. pub fn register_mmio_virtio( &mut self, - vm: &VmFd, + vm: &Vm, device_id: String, mmio_bus: &vm_device::Bus, device: MMIODevice, @@ -201,10 +202,11 @@ impl MMIODeviceManager { let io_addr = IoEventAddress::Mmio( device.resources.addr + u64::from(crate::devices::virtio::NOTIFY_REG_OFFSET), ); - vm.register_ioevent(queue_evt, &io_addr, u32::try_from(i).unwrap()) + vm.fd() + .register_ioevent(queue_evt, &io_addr, u32::try_from(i).unwrap()) .map_err(MmioError::RegisterIoEvent)?; } - vm.register_irqfd(&mmio_device.interrupt.irq_evt, irq.get()) + vm.register_irq(&mmio_device.interrupt.irq_evt, irq.get()) .map_err(MmioError::RegisterIrqFd)?; } @@ -243,7 +245,7 @@ impl MMIODeviceManager { /// to the boot cmdline. pub fn register_mmio_virtio_for_boot( &mut self, - vm: &VmFd, + vm: &Vm, resource_allocator: &ResourceAllocator, device_id: String, mmio_device: MmioTransport, @@ -275,7 +277,7 @@ impl MMIODeviceManager { /// otherwise allocate a new MMIO resources for it. 
pub fn register_mmio_serial( &mut self, - vm: &VmFd, + vm: &Vm, resource_allocator: &ResourceAllocator, serial: Arc>, device_info_opt: Option, @@ -293,7 +295,7 @@ impl MMIODeviceManager { } }; - vm.register_irqfd( + vm.register_irq( serial.lock().expect("Poisoned lock").serial.interrupt_evt(), device_info.irq.unwrap().get(), ) @@ -557,7 +559,7 @@ pub(crate) mod tests { impl MMIODeviceManager { pub(crate) fn register_virtio_test_device( &mut self, - vm: &VmFd, + vm: &Vm, guest_mem: GuestMemoryMmap, resource_allocator: &ResourceAllocator, device: Arc>, @@ -690,7 +692,7 @@ pub(crate) mod tests { device_manager .register_virtio_test_device( - vm.fd(), + &vm, vm.guest_memory().clone(), &resource_allocator, dummy, @@ -744,7 +746,7 @@ pub(crate) mod tests { for _i in crate::arch::IRQ_BASE..=crate::arch::IRQ_MAX { device_manager .register_virtio_test_device( - vm.fd(), + &vm, vm.guest_memory().clone(), &resource_allocator, Arc::new(Mutex::new(DummyDevice::new())), @@ -758,7 +760,7 @@ pub(crate) mod tests { "{}", device_manager .register_virtio_test_device( - vm.fd(), + &vm, vm.guest_memory().clone(), &resource_allocator, Arc::new(Mutex::new(DummyDevice::new())), @@ -803,7 +805,7 @@ pub(crate) mod tests { let id = String::from("foo"); let addr = device_manager .register_virtio_test_device( - vm.fd(), + &vm, vm.guest_memory().clone(), &resource_allocator, dummy, @@ -834,7 +836,7 @@ pub(crate) mod tests { let id2 = String::from("foo2"); device_manager .register_virtio_test_device( - vm.fd(), + &vm, vm.guest_memory().clone(), &resource_allocator, dummy2, diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 5457b22e39d..a60a86ea7c3 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -10,7 +10,6 @@ use std::sync::{Arc, Mutex}; use acpi::ACPIDeviceManager; use event_manager::{MutEventSubscriber, SubscriberOps}; -use kvm_ioctls::VmFd; #[cfg(target_arch = "x86_64")] use legacy::{LegacyDeviceError, 
PortIODeviceManager}; use linux_loader::loader::Cmdline; @@ -36,7 +35,7 @@ use crate::devices::virtio::transport::mmio::{IrqTrigger, MmioTransport}; use crate::resources::VmResources; use crate::snapshot::Persist; use crate::vstate::memory::GuestMemoryMmap; -use crate::{EmulateSerialInitError, EventManager}; +use crate::{EmulateSerialInitError, EventManager, Vm}; /// ACPI device manager. pub mod acpi; @@ -143,7 +142,7 @@ impl DeviceManager { pub fn new( event_manager: &mut EventManager, vcpu_exit_evt: &EventFd, - vmfd: &VmFd, + vm: &Vm, ) -> Result { let resource_allocator = Arc::new(ResourceAllocator::new()?); #[cfg(target_arch = "x86_64")] @@ -160,7 +159,7 @@ impl DeviceManager { // create pio dev manager with legacy devices let mut legacy_devices = PortIODeviceManager::new(serial, i8042)?; - legacy_devices.register_devices(&resource_allocator.pio_bus, vmfd)?; + legacy_devices.register_devices(&resource_allocator.pio_bus, vm)?; legacy_devices }; @@ -177,8 +176,7 @@ impl DeviceManager { /// Attaches a VirtioDevice device to the device manager and event manager. pub(crate) fn attach_virtio_device( &mut self, - mem: &GuestMemoryMmap, - vmfd: &VmFd, + vm: &Vm, id: String, device: Arc>, cmdline: &mut Cmdline, @@ -186,9 +184,10 @@ impl DeviceManager { ) -> Result<(), AttachMmioDeviceError> { let interrupt = Arc::new(IrqTrigger::new()); // The device mutex mustn't be locked here otherwise it will deadlock. 
- let device = MmioTransport::new(mem.clone(), interrupt, device, is_vhost_user); + let device = + MmioTransport::new(vm.guest_memory().clone(), interrupt, device, is_vhost_user); self.mmio_devices.register_mmio_virtio_for_boot( - vmfd, + vm, &self.resource_allocator, id, device, @@ -214,17 +213,17 @@ impl DeviceManager { pub(crate) fn attach_vmgenid_device( &mut self, mem: &GuestMemoryMmap, - vmfd: &VmFd, + vm: &Vm, ) -> Result<(), AttachVmgenidError> { let vmgenid = VmGenId::new(mem, &self.resource_allocator)?; - self.acpi_devices.attach_vmgenid(vmgenid, vmfd)?; + self.acpi_devices.attach_vmgenid(vmgenid, vm)?; Ok(()) } #[cfg(target_arch = "aarch64")] pub(crate) fn attach_legacy_devices_aarch64( &mut self, - vmfd: &VmFd, + vm: &Vm, event_manager: &mut EventManager, cmdline: &mut Cmdline, ) -> Result<(), AttachLegacyMmioDeviceError> { @@ -241,7 +240,7 @@ impl DeviceManager { Self::set_stdout_nonblocking(); let serial = Self::setup_serial_device(event_manager)?; self.mmio_devices - .register_mmio_serial(vmfd, &self.resource_allocator, serial, None)?; + .register_mmio_serial(vm, &self.resource_allocator, serial, None)?; self.mmio_devices.add_mmio_serial_to_cmdline(cmdline)?; } @@ -287,7 +286,7 @@ pub enum DevicePersistError { pub struct DeviceRestoreArgs<'a> { pub mem: &'a GuestMemoryMmap, - pub vm: &'a VmFd, + pub vm: &'a Vm, pub event_manager: &'a mut EventManager, pub vm_resources: &'a mut VmResources, pub instance_id: &'a str, @@ -434,7 +433,7 @@ pub(crate) mod tests { let mut cmdline = Cmdline::new(4096).unwrap(); let mut event_manager = EventManager::new().unwrap(); vmm.device_manager - .attach_legacy_devices_aarch64(vmm.vm.fd(), &mut event_manager, &mut cmdline) + .attach_legacy_devices_aarch64(&vmm.vm, &mut event_manager, &mut cmdline) .unwrap(); assert!(vmm.device_manager.mmio_devices.rtc.is_some()); assert!(vmm.device_manager.mmio_devices.serial.is_none()); @@ -442,7 +441,7 @@ pub(crate) mod tests { let mut vmm = default_vmm(); cmdline.insert("console", 
"/dev/blah").unwrap(); vmm.device_manager - .attach_legacy_devices_aarch64(vmm.vm.fd(), &mut event_manager, &mut cmdline) + .attach_legacy_devices_aarch64(&vmm.vm, &mut event_manager, &mut cmdline) .unwrap(); assert!(vmm.device_manager.mmio_devices.rtc.is_some()); assert!(vmm.device_manager.mmio_devices.serial.is_some()); diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index f267212ba2e..6b1168ec965 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -7,7 +7,6 @@ use std::fmt::{self, Debug}; use std::sync::{Arc, Mutex}; use event_manager::{MutEventSubscriber, SubscriberOps}; -use kvm_ioctls::VmFd; use log::{error, warn}; use serde::{Deserialize, Serialize}; use vm_allocator::AllocPolicy; @@ -15,7 +14,6 @@ use vm_allocator::AllocPolicy; use super::acpi::ACPIDeviceManager; use super::mmio::*; use super::resources::ResourceAllocator; -use crate::EventManager; #[cfg(target_arch = "aarch64")] use crate::arch::DeviceType; use crate::devices::acpi::vmgenid::{VMGenIDState, VMGenIdConstructorArgs, VmGenId, VmGenIdError}; @@ -51,6 +49,7 @@ use crate::resources::{ResourcesError, VmResources}; use crate::snapshot::Persist; use crate::vmm_config::mmds::MmdsConfigError; use crate::vstate::memory::GuestMemoryMmap; +use crate::{EventManager, Vm}; /// Errors for (de)serialization of the MMIO device manager. 
#[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -215,7 +214,7 @@ pub enum SharedDeviceType { pub struct MMIODevManagerConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, - pub vm: &'a VmFd, + pub vm: &'a Vm, pub event_manager: &'a mut EventManager, pub resource_allocator: &'a ResourceAllocator, pub vm_resources: &'a mut VmResources, @@ -244,7 +243,7 @@ pub struct ACPIDeviceManagerState { pub struct ACPIDeviceManagerConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, pub resource_allocator: &'a ResourceAllocator, - pub vm: &'a VmFd, + pub vm: &'a Vm, } #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -817,7 +816,7 @@ mod tests { let vm_resources = &mut VmResources::default(); let restore_args = MMIODevManagerConstructorArgs { mem: vmm.vm.guest_memory(), - vm: vmm.vm.fd(), + vm: &vmm.vm, event_manager: &mut event_manager, resource_allocator: &resource_allocator, vm_resources, diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 7a8965a4b9a..cf8879df033 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -9,10 +9,16 @@ use std::collections::HashMap; use std::fs::OpenOptions; use std::io::Write; use std::path::Path; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; -use kvm_bindings::{KVM_MEM_LOG_DIRTY_PAGES, kvm_userspace_memory_region}; +#[cfg(target_arch = "x86_64")] +use kvm_bindings::KVM_IRQCHIP_IOAPIC; +use kvm_bindings::{ + KVM_IRQ_ROUTING_IRQCHIP, KVM_MEM_LOG_DIRTY_PAGES, kvm_irq_routing_entry, + kvm_userspace_memory_region, +}; use kvm_ioctls::VmFd; +use vmm_sys_util::errno; use vmm_sys_util::eventfd::EventFd; pub use crate::arch::{ArchVm as Vm, ArchVmError, VmState}; @@ -26,6 +32,26 @@ use crate::vstate::memory::{ use crate::vstate::vcpu::VcpuError; use crate::{DirtyBitmap, Vcpu, mem_size_mib}; +#[derive(Debug, thiserror::Error, displaydoc::Display)] +/// Errors related with Firecracker interrupts +pub enum InterruptError { + /// Error allocating resources: {0} + Allocator(#[from] 
vm_allocator::Error), + /// EventFd error: {0} + EventFd(std::io::Error), + /// FamStruct error: {0} + FamStruct(#[from] vmm_sys_util::fam::Error), + /// KVM error: {0} + Kvm(#[from] kvm_ioctls::Error), +} + +#[derive(Debug)] +/// A struct representing an interrupt line used by some device of the microVM +pub struct RoutingEntry { + entry: kvm_irq_routing_entry, + masked: bool, +} + /// Architecture independent parts of a VM. #[derive(Debug)] pub struct VmCommon { @@ -34,6 +60,8 @@ pub struct VmCommon { max_memslots: usize, /// The guest memory of this Vm. pub guest_memory: GuestMemoryMmap, + /// Interrupts used by Vm's devices + pub interrupts: Mutex>, } /// Errors associated with the wrappers over KVM ioctls. @@ -101,6 +129,7 @@ impl Vm { fd, max_memslots: kvm.max_nr_memslots(), guest_memory: GuestMemoryMmap::default(), + interrupts: Mutex::new(HashMap::new()), }) } @@ -276,6 +305,40 @@ impl Vm { file.sync_all() .map_err(|err| MemoryBackingFile("sync_all", err)) } + + /// Register a device IRQ + pub fn register_irq(&self, fd: &EventFd, gsi: u32) -> Result<(), errno::Error> { + self.common.fd.register_irqfd(fd, gsi)?; + + let mut entry = kvm_irq_routing_entry { + gsi, + type_: KVM_IRQ_ROUTING_IRQCHIP, + ..Default::default() + }; + #[cfg(target_arch = "x86_64")] + { + entry.u.irqchip.irqchip = KVM_IRQCHIP_IOAPIC; + } + #[cfg(target_arch = "aarch64")] + { + entry.u.irqchip.irqchip = 0; + } + + entry.u.irqchip.pin = gsi; + + self.common + .interrupts + .lock() + .expect("Poisoned lock") + .insert( + gsi, + RoutingEntry { + entry, + masked: false, + }, + ); + Ok(()) + } } #[cfg(test)] From dc1f572924909724d0903c438f2a2eb4dc4c8043 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 3 Jun 2025 12:28:39 +0200 Subject: [PATCH 37/56] interrupts: add support for MSI/MSI-X interrupts Enable Vm to vend and manage MSI/MSI-X interrupts. This adds the logic to create a set of MSI vectors and then handle their lifetime. 
Signed-off-by: Babis Chalios --- src/vmm/src/vstate/vm.rs | 442 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 439 insertions(+), 3 deletions(-) diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index cf8879df033..47c3011f37d 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -9,19 +9,25 @@ use std::collections::HashMap; use std::fs::OpenOptions; use std::io::Write; use std::path::Path; +use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Mutex}; #[cfg(target_arch = "x86_64")] use kvm_bindings::KVM_IRQCHIP_IOAPIC; use kvm_bindings::{ - KVM_IRQ_ROUTING_IRQCHIP, KVM_MEM_LOG_DIRTY_PAGES, kvm_irq_routing_entry, - kvm_userspace_memory_region, + KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI, KVM_MEM_LOG_DIRTY_PAGES, KVM_MSI_VALID_DEVID, + KvmIrqRouting, kvm_irq_routing_entry, kvm_userspace_memory_region, }; use kvm_ioctls::VmFd; +use log::debug; +use vm_device::interrupt::{ + InterruptIndex, InterruptSourceConfig, InterruptSourceGroup, MsiIrqSourceConfig, +}; use vmm_sys_util::errno; use vmm_sys_util::eventfd::EventFd; pub use crate::arch::{ArchVm as Vm, ArchVmError, VmState}; +use crate::device_manager::resources::ResourceAllocator; use crate::logger::info; use crate::persist::CreateSnapshotError; use crate::utils::u64_to_usize; @@ -52,6 +58,148 @@ pub struct RoutingEntry { masked: bool, } +/// Type that describes an allocated interrupt +#[derive(Debug)] +pub struct MsiVector { + /// GSI used for this vector + pub gsi: u32, + /// EventFd used for this vector + pub event_fd: EventFd, + /// Flag determining whether the vector is enabled + pub enabled: AtomicBool, +} + +impl MsiVector { + /// Create a new [`MsiVector`] of a particular type + pub fn new(gsi: u32, enabled: bool) -> Result { + Ok(MsiVector { + gsi, + event_fd: EventFd::new(libc::EFD_NONBLOCK).map_err(InterruptError::EventFd)?, + enabled: AtomicBool::new(enabled), + }) + } +} + +impl MsiVector { + /// Enable vector + fn enable(&self, vmfd: &VmFd) -> 
Result<(), errno::Error> { + if !self.enabled.load(Ordering::Acquire) { + vmfd.register_irqfd(&self.event_fd, self.gsi)?; + self.enabled.store(true, Ordering::Release); + } + + Ok(()) + } + + /// Disable vector + fn disable(&self, vmfd: &VmFd) -> Result<(), errno::Error> { + if self.enabled.load(Ordering::Acquire) { + vmfd.unregister_irqfd(&self.event_fd, self.gsi)?; + self.enabled.store(false, Ordering::Release); + } + + Ok(()) + } +} + +#[derive(Debug)] +/// MSI interrupts created for a VirtIO device +pub struct MsiVectorGroup { + vm: Arc, + irq_routes: HashMap, +} + +impl MsiVectorGroup { + /// Returns the number of vectors in this group + pub fn num_vectors(&self) -> u16 { + // It is safe to unwrap here. We are creating `MsiVectorGroup` objects through the + // `Vm::create_msix_group` where the argument for the number of `irq_routes` is a `u16`. + u16::try_from(self.irq_routes.len()).unwrap() + } +} + +impl InterruptSourceGroup for MsiVectorGroup { + fn enable(&self) -> vm_device::interrupt::Result<()> { + for route in self.irq_routes.values() { + route.enable(&self.vm.common.fd)?; + } + + Ok(()) + } + + fn disable(&self) -> vm_device::interrupt::Result<()> { + for route in self.irq_routes.values() { + route.disable(&self.vm.common.fd)?; + } + + Ok(()) + } + + fn trigger(&self, index: InterruptIndex) -> vm_device::interrupt::Result<()> { + self.notifier(index) + .ok_or(std::io::Error::other(format!( + "trigger: invalid interrupt index {index}" + )))? 
+ .write(1) + } + + fn notifier(&self, index: InterruptIndex) -> Option<&EventFd> { + self.irq_routes.get(&index).map(|route| &route.event_fd) + } + + fn update( + &self, + index: InterruptIndex, + config: InterruptSourceConfig, + masked: bool, + set_gsi: bool, + ) -> vm_device::interrupt::Result<()> { + let msi_config = match config { + InterruptSourceConfig::LegacyIrq(_) => { + return Err(std::io::Error::other( + "MSI-x update: invalid configuration type", + )); + } + InterruptSourceConfig::MsiIrq(config) => config, + }; + + if let Some(route) = self.irq_routes.get(&index) { + // When an interrupt is masked the GSI will not be passed to KVM through + // KVM_SET_GSI_ROUTING. So, call [`disable()`] to unregister the interrupt file + // descriptor before passing the interrupt routes to KVM + if masked { + route.disable(&self.vm.common.fd)?; + } + + self.vm.register_msi(route, masked, msi_config)?; + if set_gsi { + self.vm + .set_gsi_routes() + .map_err(|err| std::io::Error::other(format!("MSI-X update: {err}")))? + } + + // Assign KVM_IRQFD after KVM_SET_GSI_ROUTING to avoid + // panic on kernel which does not have commit a80ced6ea514 + // (KVM: SVM: fix panic on out-of-bounds guest IRQ). + if !masked { + route.enable(&self.vm.common.fd)?; + } + + return Ok(()); + } + + Err(std::io::Error::other(format!( + "MSI-X update: invalid vector index {index}" + ))) + } + + fn set_gsi(&self) -> vm_device::interrupt::Result<()> { + self.vm + .set_gsi_routes() + .map_err(|err| std::io::Error::other(format!("MSI-X update: {err}"))) + } +} + /// Architecture independent parts of a VM. 
#[derive(Debug)] pub struct VmCommon { @@ -323,7 +471,6 @@ impl Vm { { entry.u.irqchip.irqchip = 0; } - entry.u.irqchip.pin = gsi; self.common @@ -339,10 +486,89 @@ impl Vm { ); Ok(()) } + + /// Register an MSI device interrupt + pub fn register_msi( + &self, + route: &MsiVector, + masked: bool, + config: MsiIrqSourceConfig, + ) -> Result<(), errno::Error> { + let mut entry = kvm_irq_routing_entry { + gsi: route.gsi, + type_: KVM_IRQ_ROUTING_MSI, + ..Default::default() + }; + entry.u.msi.address_lo = config.low_addr; + entry.u.msi.address_hi = config.high_addr; + entry.u.msi.data = config.data; + + if self.common.fd.check_extension(kvm_ioctls::Cap::MsiDevid) { + // According to KVM documentation: + // https://docs.kernel.org/virt/kvm/api.html#kvm-set-gsi-routing + // + // if the capability is set, we need to set the flag and provide a valid unique device + // ID. "For PCI, this is usually a BDF identifier in the lower 16 bits". + // + // The layout of `config.devid` is: + // + // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --| + // | segment | bus | device | function | + // + // For the time being, we are using a single PCI segment and a single bus per segment + // so just passing config.devid should be fine. + entry.flags = KVM_MSI_VALID_DEVID; + entry.u.msi.__bindgen_anon_1.devid = config.devid; + } + + self.common + .interrupts + .lock() + .expect("Poisoned lock") + .insert(route.gsi, RoutingEntry { entry, masked }); + + Ok(()) + } + + /// Create a group of MSI-X interrupts + pub fn create_msix_group( + vm: Arc, + resource_allocator: &ResourceAllocator, + count: u16, + ) -> Result { + debug!("Creating new MSI group with {count} vectors"); + let mut irq_routes = HashMap::with_capacity(count as usize); + for (gsi, i) in resource_allocator + .allocate_gsi(count as u32)? + .iter() + .zip(0u32..) 
+ { + irq_routes.insert(i, MsiVector::new(*gsi, false)?); + } + + Ok(MsiVectorGroup { vm, irq_routes }) + } + + /// Set GSI routes to KVM + pub fn set_gsi_routes(&self) -> Result<(), InterruptError> { + let entries = self.common.interrupts.lock().expect("Poisoned lock"); + let mut routes = KvmIrqRouting::new(0)?; + + for entry in entries.values() { + if entry.masked { + continue; + } + routes.push(entry.entry)?; + } + + self.common.fd.set_gsi_routing(&routes)?; + Ok(()) + } } #[cfg(test)] pub(crate) mod tests { + use vm_device::interrupt::{InterruptSourceConfig, LegacyIrqSourceConfig}; use vm_memory::GuestAddress; use vm_memory::mmap::MmapRegionBuilder; @@ -454,4 +680,214 @@ pub(crate) mod tests { assert_eq!(vcpu_vec.len(), vcpu_count as usize); } + + fn enable_irqchip(vm: &mut Vm) { + #[cfg(target_arch = "x86_64")] + vm.setup_irqchip().unwrap(); + #[cfg(target_arch = "aarch64")] + vm.setup_irqchip(1).unwrap(); + } + + fn create_msix_group(vm: &Arc) -> MsiVectorGroup { + let resource_allocator = ResourceAllocator::new().unwrap(); + Vm::create_msix_group(vm.clone(), &resource_allocator, 4).unwrap() + } + + #[test] + fn test_msi_vector_group_new() { + let (_, vm) = setup_vm_with_memory(mib_to_bytes(128)); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + assert_eq!(msix_group.num_vectors(), 4); + } + + #[test] + fn test_msi_vector_group_enable_disable() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + + // Initially all vectors are disabled + for route in msix_group.irq_routes.values() { + assert!(!route.enabled.load(Ordering::Acquire)) + } + + // Enable works + msix_group.enable().unwrap(); + for route in msix_group.irq_routes.values() { + assert!(route.enabled.load(Ordering::Acquire)); + } + // Enabling an enabled group doesn't error out + msix_group.enable().unwrap(); + + // Disable works + msix_group.disable().unwrap(); + for route in 
msix_group.irq_routes.values() { + assert!(!route.enabled.load(Ordering::Acquire)) + } + // Disabling a disabled group doesn't error out + } + + #[test] + fn test_msi_vector_group_trigger() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + + // We can now trigger all vectors + for i in 0..4 { + msix_group.trigger(i).unwrap() + } + + // We can't trigger an invalid vector + msix_group.trigger(4).unwrap_err(); + } + + #[test] + fn test_msi_vector_group_notifier() { + let (_, vm) = setup_vm_with_memory(mib_to_bytes(128)); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + + for i in 0..4 { + assert!(msix_group.notifier(i).is_some()); + } + + assert!(msix_group.notifier(4).is_none()); + } + + #[test] + fn test_msi_vector_group_update_wrong_config() { + let (_, vm) = setup_vm_with_memory(mib_to_bytes(128)); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + let irq_config = LegacyIrqSourceConfig { irqchip: 0, pin: 0 }; + msix_group + .update(0, InterruptSourceConfig::LegacyIrq(irq_config), true, true) + .unwrap_err(); + } + + #[test] + fn test_msi_vector_group_update_invalid_vector() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + let config = InterruptSourceConfig::MsiIrq(MsiIrqSourceConfig { + high_addr: 0x42, + low_addr: 0x12, + data: 0x12, + devid: 0xafa, + }); + msix_group.update(0, config, true, true).unwrap(); + msix_group.update(4, config, true, true).unwrap_err(); + } + + #[test] + fn test_msi_vector_group_update() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + let vm = Arc::new(vm); + assert!(vm.common.interrupts.lock().unwrap().is_empty()); + let msix_group = create_msix_group(&vm); + + // Set some configuration for the vectors. 
Initially all are masked + let mut config = MsiIrqSourceConfig { + high_addr: 0x42, + low_addr: 0x13, + data: 0x12, + devid: 0xafa, + }; + for i in 0..4 { + config.data = 0x12 * i; + msix_group + .update(i, InterruptSourceConfig::MsiIrq(config), true, false) + .unwrap(); + } + + // All vectors should be disabled + for vector in msix_group.irq_routes.values() { + assert!(!vector.enabled.load(Ordering::Acquire)); + } + + for i in 0..4 { + let gsi = crate::arch::IRQ_BASE + i; + let interrupts = vm.common.interrupts.lock().unwrap(); + let kvm_route = interrupts.get(&gsi).unwrap(); + assert!(kvm_route.masked); + assert_eq!(kvm_route.entry.gsi, gsi); + assert_eq!(kvm_route.entry.type_, KVM_IRQ_ROUTING_MSI); + // SAFETY: because we know we setup MSI routes. + unsafe { + assert_eq!(kvm_route.entry.u.msi.address_hi, 0x42); + assert_eq!(kvm_route.entry.u.msi.address_lo, 0x13); + assert_eq!(kvm_route.entry.u.msi.data, 0x12 * i); + } + } + + // Simply enabling the vectors should not update the registered IRQ routes + msix_group.enable().unwrap(); + for i in 0..4 { + let gsi = crate::arch::IRQ_BASE + i; + let interrupts = vm.common.interrupts.lock().unwrap(); + let kvm_route = interrupts.get(&gsi).unwrap(); + assert!(kvm_route.masked); + assert_eq!(kvm_route.entry.gsi, gsi); + assert_eq!(kvm_route.entry.type_, KVM_IRQ_ROUTING_MSI); + // SAFETY: because we know we setup MSI routes. 
+ unsafe { + assert_eq!(kvm_route.entry.u.msi.address_hi, 0x42); + assert_eq!(kvm_route.entry.u.msi.address_lo, 0x13); + assert_eq!(kvm_route.entry.u.msi.data, 0x12 * i); + } + } + + // Updating the config of a vector should enable its route (and only its route) + config.data = 0; + msix_group + .update(0, InterruptSourceConfig::MsiIrq(config), false, true) + .unwrap(); + for i in 0..4 { + let gsi = crate::arch::IRQ_BASE + i; + let interrupts = vm.common.interrupts.lock().unwrap(); + let kvm_route = interrupts.get(&gsi).unwrap(); + assert_eq!(kvm_route.masked, i != 0); + assert_eq!(kvm_route.entry.gsi, gsi); + assert_eq!(kvm_route.entry.type_, KVM_IRQ_ROUTING_MSI); + // SAFETY: because we know we setup MSI routes. + unsafe { + assert_eq!(kvm_route.entry.u.msi.address_hi, 0x42); + assert_eq!(kvm_route.entry.u.msi.address_lo, 0x13); + assert_eq!(kvm_route.entry.u.msi.data, 0x12 * i); + } + } + } + + #[cfg(target_arch = "x86_64")] + #[test] + fn test_msi_vector_group_set_gsi_without_ioapic() { + // Setting GSI routes without IOAPIC setup should fail on x86. Apparently, it doesn't fail + // on Aarch64 + let (_, vm) = setup_vm_with_memory(mib_to_bytes(128)); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + let err = msix_group.set_gsi().unwrap_err(); + assert_eq!( + format!("{err}"), + "MSI-X update: KVM error: Invalid argument (os error 22)" + ); + } + + #[test] + fn test_msi_vector_group_set_gsi() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + + msix_group.set_gsi().unwrap(); + } } From 4fd4f442d2ded33c742934de8fa0058358c93ed0 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 10 Jun 2025 18:46:23 +0200 Subject: [PATCH 38/56] vstate: support serializing interrupts to snapshots Vm object is now maintaining information about the interrupts (both traditional IRQs and MSI-X vectors) that are being used by microVM devices. 
Derive Serialize/Deserialize and add logic for recreating objects for relevant types. Signed-off-by: Babis Chalios --- src/vmm/src/vstate/vm.rs | 58 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 47c3011f37d..950bcac652d 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -20,6 +20,7 @@ use kvm_bindings::{ }; use kvm_ioctls::VmFd; use log::debug; +use serde::{Deserialize, Serialize}; use vm_device::interrupt::{ InterruptIndex, InterruptSourceConfig, InterruptSourceGroup, MsiIrqSourceConfig, }; @@ -30,6 +31,7 @@ pub use crate::arch::{ArchVm as Vm, ArchVmError, VmState}; use crate::device_manager::resources::ResourceAllocator; use crate::logger::info; use crate::persist::CreateSnapshotError; +use crate::snapshot::Persist; use crate::utils::u64_to_usize; use crate::vmm_config::snapshot::SnapshotType; use crate::vstate::memory::{ @@ -51,7 +53,7 @@ pub enum InterruptError { Kvm(#[from] kvm_ioctls::Error), } -#[derive(Debug)] +#[derive(Debug, Serialize, Deserialize)] /// A struct representing an interrupt line used by some device of the microVM pub struct RoutingEntry { entry: kvm_irq_routing_entry, @@ -118,6 +120,38 @@ impl MsiVectorGroup { } } +impl<'a> Persist<'a> for MsiVectorGroup { + type State = HashMap; + type ConstructorArgs = Arc; + type Error = InterruptError; + + fn save(&self) -> Self::State { + // We don't save the "enabled" state of the MSI interrupt.
PCI devices store the MSI-X + // configuration and make sure that the vector is enabled during the restore path if it was + // initially enabled + self.irq_routes + .iter() + .map(|(id, route)| (*id, route.gsi)) + .collect() + } + + fn restore( + constructor_args: Self::ConstructorArgs, + state: &Self::State, + ) -> std::result::Result { + let mut irq_routes = HashMap::new(); + + for (id, gsi) in state { + irq_routes.insert(*id, MsiVector::new(*gsi, false)?); + } + + Ok(MsiVectorGroup { + vm: constructor_args, + irq_routes, + }) + } +} + impl InterruptSourceGroup for MsiVectorGroup { fn enable(&self) -> vm_device::interrupt::Result<()> { for route in self.irq_routes.values() { @@ -890,4 +924,26 @@ pub(crate) mod tests { msix_group.set_gsi().unwrap(); } + + #[test] + fn test_msi_vector_group_persistence() { + let (_, mut vm) = setup_vm_with_memory(mib_to_bytes(128)); + enable_irqchip(&mut vm); + let vm = Arc::new(vm); + let msix_group = create_msix_group(&vm); + + msix_group.enable().unwrap(); + let state = msix_group.save(); + let restored_group = MsiVectorGroup::restore(vm, &state).unwrap(); + + assert_eq!(msix_group.num_vectors(), restored_group.num_vectors()); + // Even if an MSI group is enabled, we don't save it as such. During restoration, the PCI + // transport will make sure the correct config is set for the vectors and enable them + // accordingly. + for (id, vector) in msix_group.irq_routes { + let new_vector = restored_group.irq_routes.get(&id).unwrap(); + assert_eq!(vector.gsi, new_vector.gsi); + assert!(!new_vector.enabled.load(Ordering::Acquire)); + } + } } From 38d5271b9b2ff5a50faa6c3ca93d9fdcb7617ace Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 3 Jun 2025 18:10:50 +0200 Subject: [PATCH 39/56] virtio: initialize queue size with max_size Apparently, PCI needs Queue::size to be initialized to the maximum possible size supported by the device, otherwise initialization fails. 
Signed-off-by: Babis Chalios --- src/vmm/src/devices/virtio/queue.rs | 2 +- src/vmm/src/devices/virtio/transport/mmio.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vmm/src/devices/virtio/queue.rs b/src/vmm/src/devices/virtio/queue.rs index 9977070293e..79c635e5c4d 100644 --- a/src/vmm/src/devices/virtio/queue.rs +++ b/src/vmm/src/devices/virtio/queue.rs @@ -280,7 +280,7 @@ impl Queue { pub fn new(max_size: u16) -> Queue { Queue { max_size, - size: 0, + size: max_size, ready: false, desc_table_address: GuestAddress(0), avail_ring_address: GuestAddress(0), diff --git a/src/vmm/src/devices/virtio/transport/mmio.rs b/src/vmm/src/devices/virtio/transport/mmio.rs index 5ecc3fa8ffe..3a8aa1ad42e 100644 --- a/src/vmm/src/devices/virtio/transport/mmio.rs +++ b/src/vmm/src/devices/virtio/transport/mmio.rs @@ -787,7 +787,7 @@ pub(crate) mod tests { assert_eq!(d.queue_select, 3); d.queue_select = 0; - assert_eq!(d.locked_device().queues()[0].size, 0); + assert_eq!(d.locked_device().queues()[0].size, 16); write_le_u32(&mut buf[..], 16); d.write(0x0, 0x38, &buf[..]); assert_eq!(d.locked_device().queues()[0].size, 16); From 0f79f639c64e9c74c171139370bb6145dcc9749c Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 3 Jun 2025 18:13:08 +0200 Subject: [PATCH 40/56] acpi: PCI compatible flags in FADT Remove the flags in FADT that were declaring we do not support MSI and PCI ASPM. 
Signed-off-by: Babis Chalios --- src/vmm/src/acpi/x86_64.rs | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/vmm/src/acpi/x86_64.rs b/src/vmm/src/acpi/x86_64.rs index de850a9989f..53eeac7b5e2 100644 --- a/src/vmm/src/acpi/x86_64.rs +++ b/src/vmm/src/acpi/x86_64.rs @@ -3,10 +3,7 @@ use std::mem::size_of; -use acpi_tables::fadt::{ - IAPC_BOOT_ARG_FLAGS_MSI_NOT_PRESENT, IAPC_BOOT_ARG_FLAGS_PCI_ASPM, - IAPC_BOOT_ARG_FLAGS_VGA_NOT_PRESENT, -}; +use acpi_tables::fadt::IAPC_BOOT_ARG_FLAGS_VGA_NOT_PRESENT; use acpi_tables::madt::{IoAPIC, LocalAPIC}; use acpi_tables::{Fadt, aml}; use vm_memory::GuestAddress; @@ -33,11 +30,7 @@ pub(crate) fn setup_arch_fadt(fadt: &mut Fadt) { // neither do we support ASPM, or MSI type of interrupts. // More info here: // https://uefi.org/specs/ACPI/6.5/05_ACPI_Software_Programming_Model.html?highlight=0a06#ia-pc-boot-architecture-flags - fadt.setup_iapc_flags( - (1 << IAPC_BOOT_ARG_FLAGS_VGA_NOT_PRESENT) - | (1 << IAPC_BOOT_ARG_FLAGS_PCI_ASPM) - | (1 << IAPC_BOOT_ARG_FLAGS_MSI_NOT_PRESENT), - ); + fadt.setup_iapc_flags(1 << IAPC_BOOT_ARG_FLAGS_VGA_NOT_PRESENT); } #[inline(always)] From f62629a0ad586b551dacd8e83a6d629c82ff54de Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 3 Jun 2025 14:05:42 +0200 Subject: [PATCH 41/56] vmm: simplify device errors Merge the device-related errors that DeviceManager might return. This way, we can avoid adding yet another error type for PCI devices and reduce some of the variants of StartMicrovmError.
Suggested-by: Egor Lazarchuk Signed-off-by: Babis Chalios --- src/vmm/src/builder.rs | 25 ++++++++++--------------- src/vmm/src/device_manager/mod.rs | 31 +++++++++---------------------- 2 files changed, 19 insertions(+), 37 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index b0712abc3a5..8bb4dff867b 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -22,12 +22,12 @@ use crate::arch::{ConfigurationError, configure_system_for_boot, load_kernel}; #[cfg(target_arch = "aarch64")] use crate::construct_kvm_mpidrs; use crate::cpu_config::templates::{GetCpuTemplate, GetCpuTemplateError, GuestConfigError}; -#[cfg(target_arch = "aarch64")] -use crate::device_manager::AttachLegacyMmioDeviceError; +#[cfg(target_arch = "x86_64")] +use crate::device_manager; use crate::device_manager::pci_mngr::PciManagerError; use crate::device_manager::{ - AttachMmioDeviceError, AttachVmgenidError, DeviceManager, DeviceManagerCreateError, - DevicePersistError, DeviceRestoreArgs, + AttachDeviceError, DeviceManager, DeviceManagerCreateError, DevicePersistError, + DeviceRestoreArgs, }; use crate::devices::acpi::vmgenid::VmGenIdError; use crate::devices::virtio::balloon::Balloon; @@ -48,18 +48,15 @@ use crate::vstate::kvm::{Kvm, KvmError}; use crate::vstate::memory::GuestRegionMmap; use crate::vstate::vcpu::VcpuError; use crate::vstate::vm::{Vm, VmError}; -use crate::{EventManager, Vmm, VmmError, device_manager}; +use crate::{EventManager, Vmm, VmmError}; /// Errors associated with starting the instance. 
#[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum StartMicrovmError { /// Unable to attach block device to Vmm: {0} AttachBlockDevice(io::Error), - /// Unable to attach the VMGenID device: {0} - AttachVmgenidDevice(#[from] AttachVmgenidError), - #[cfg(target_arch = "aarch64")] - /// Unable to attach legacy MMIO devices: {0} - AttachLegacyDevices(#[from] AttachLegacyMmioDeviceError), + /// Could not attach device: {0} + AttachDevice(#[from] AttachDeviceError), /// System configuration error: {0} ConfigureSystem(#[from] ConfigurationError), /// Failed to create device manager: {0} @@ -104,8 +101,6 @@ pub enum StartMicrovmError { NetDeviceNotConfigured, /// Cannot open the block device backing file: {0} OpenBlockDevice(io::Error), - /// Cannot initialize a MMIO Device or add a device to the MMIO Bus or cmdline: {0} - RegisterMmioDevice(#[from] device_manager::AttachMmioDeviceError), /// Cannot restore microvm state: {0} RestoreMicrovmState(MicrovmStateError), /// Cannot set vm resources: {0} @@ -563,7 +558,7 @@ fn attach_entropy_device( cmdline: &mut LoaderKernelCmdline, entropy_device: &Arc>, event_manager: &mut EventManager, -) -> Result<(), AttachMmioDeviceError> { +) -> Result<(), AttachDeviceError> { let id = entropy_device .lock() .expect("Poisoned lock") @@ -625,7 +620,7 @@ fn attach_unixsock_vsock_device( cmdline: &mut LoaderKernelCmdline, unix_vsock: &Arc>>, event_manager: &mut EventManager, -) -> Result<(), AttachMmioDeviceError> { +) -> Result<(), AttachDeviceError> { let id = String::from(unix_vsock.lock().expect("Poisoned lock").id()); event_manager.add_subscriber(unix_vsock.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. 
@@ -638,7 +633,7 @@ fn attach_balloon_device( cmdline: &mut LoaderKernelCmdline, balloon: &Arc>, event_manager: &mut EventManager, -) -> Result<(), AttachMmioDeviceError> { +) -> Result<(), AttachDeviceError> { let id = String::from(balloon.lock().expect("Poisoned lock").id()); event_manager.add_subscriber(balloon.clone()); // The device mutex mustn't be locked here otherwise it will deadlock. diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index a60a86ea7c3..8df4da2863d 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -64,34 +64,21 @@ pub enum DeviceManagerCreateError { #[derive(Debug, thiserror::Error, displaydoc::Display)] /// Error while attaching a VirtIO device -pub enum AttachMmioDeviceError { +pub enum AttachDeviceError { /// MMIO transport error: {0} MmioTransport(#[from] MmioError), /// Error inserting device in bus: {0} Bus(#[from] vm_device::BusError), -} - -#[derive(Debug, thiserror::Error, displaydoc::Display)] -/// Error while attaching the VMGenID device -pub enum AttachVmgenidError { /// Error creating VMGenID device: {0} CreateVmGenID(#[from] VmGenIdError), /// Error while registering VMGenID with KVM: {0} AttachVmGenID(#[from] kvm_ioctls::Error), -} - -#[cfg(target_arch = "aarch64")] -#[derive(Debug, thiserror::Error, displaydoc::Display)] -/// Error while attaching the VMGenID device -pub enum AttachLegacyMmioDeviceError { + #[cfg(target_arch = "aarch64")] /// Cmdline error Cmdline, + #[cfg(target_arch = "aarch64")] /// Error creating serial device: {0} CreateSerial(#[from] std::io::Error), - /// Error registering device: {0} - RegisterMMIODevice(#[from] MmioError), - /// Error inserting device in the Bus: {0} - Bus(#[from] vm_device::BusError), } #[derive(Debug)] @@ -181,7 +168,7 @@ impl DeviceManager { device: Arc>, cmdline: &mut Cmdline, is_vhost_user: bool, - ) -> Result<(), AttachMmioDeviceError> { + ) -> Result<(), AttachDeviceError> { let interrupt = 
Arc::new(IrqTrigger::new()); // The device mutex mustn't be locked here otherwise it will deadlock. let device = @@ -201,7 +188,7 @@ impl DeviceManager { pub(crate) fn attach_boot_timer_device( &mut self, request_ts: TimestampUs, - ) -> Result<(), AttachMmioDeviceError> { + ) -> Result<(), AttachDeviceError> { let boot_timer = Arc::new(Mutex::new(BootTimer::new(request_ts))); self.mmio_devices @@ -214,7 +201,7 @@ impl DeviceManager { &mut self, mem: &GuestMemoryMmap, vm: &Vm, - ) -> Result<(), AttachVmgenidError> { + ) -> Result<(), AttachDeviceError> { let vmgenid = VmGenId::new(mem, &self.resource_allocator)?; self.acpi_devices.attach_vmgenid(vmgenid, vm)?; Ok(()) @@ -226,13 +213,13 @@ impl DeviceManager { vm: &Vm, event_manager: &mut EventManager, cmdline: &mut Cmdline, - ) -> Result<(), AttachLegacyMmioDeviceError> { + ) -> Result<(), AttachDeviceError> { // Serial device setup. let cmdline_contains_console = cmdline .as_cstring() - .map_err(|_| AttachLegacyMmioDeviceError::Cmdline)? + .map_err(|_| AttachDeviceError::Cmdline)? .into_string() - .map_err(|_| AttachLegacyMmioDeviceError::Cmdline)? + .map_err(|_| AttachDeviceError::Cmdline)? .contains("console="); if cmdline_contains_console { From 92ea97ffb3ee28ae2b0962a6cfbe0eff9bded8cb Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 2 Jun 2025 20:48:16 +0200 Subject: [PATCH 42/56] pci: add virtio-pci transport implementation Add a VirtIO PCI transport implementation. When a Firecracker microVM is launched with --enable-pci, we will create all VirtIO devices using the PCI transport layer. Snapshotting of VirtIO PCI devices is not supported and we will add this functionality in a later commit. Add a couple of tests that ensure that the PCI configuration space is as expected. We read common fields and make sure the BAR we allocate for the VirtIO device is as expected.
Signed-off-by: Babis Chalios --- Cargo.lock | 8 + src/vmm/Cargo.toml | 2 + src/vmm/src/builder.rs | 14 +- src/vmm/src/device_manager/mod.rs | 27 +- src/vmm/src/device_manager/pci_mngr.rs | 131 +- src/vmm/src/devices/virtio/device.rs | 2 +- src/vmm/src/devices/virtio/queue.rs | 13 + src/vmm/src/devices/virtio/transport/mod.rs | 2 + .../virtio/transport/pci/common_config.rs | 415 ++++++ .../devices/virtio/transport/pci/device.rs | 1279 +++++++++++++++++ .../src/devices/virtio/transport/pci/mod.rs | 5 + 11 files changed, 1887 insertions(+), 11 deletions(-) create mode 100644 src/vmm/src/devices/virtio/transport/pci/common_config.rs create mode 100644 src/vmm/src/devices/virtio/transport/pci/device.rs create mode 100644 src/vmm/src/devices/virtio/transport/pci/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 3b5aa2637bd..d0873fb7346 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -112,6 +112,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "anyhow" +version = "1.0.98" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" + [[package]] name = "arrayvec" version = "0.7.6" @@ -1654,11 +1660,13 @@ version = "0.1.0" dependencies = [ "acpi_tables", "aes-gcm", + "anyhow", "arrayvec", "aws-lc-rs", "base64", "bincode", "bitflags 2.9.1", + "byteorder", "crc64", "criterion", "derive_more", diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index bee0f88efa8..688d44b660d 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -17,11 +17,13 @@ gdb = ["arrayvec", "gdbstub", "gdbstub_arch"] acpi_tables = { path = "../acpi-tables" } aes-gcm = { version = "0.10.1", default-features = false, features = ["aes"] } +anyhow = "1.0.98" arrayvec = { version = "0.7.6", optional = true } aws-lc-rs = { version = "1.13.1", features = ["bindgen"] } base64 = "0.22.1" bincode = { version = "2.0.1", features = ["serde"] } bitflags = "2.9.1" +byteorder = "1.5.0" crc64 = "2.0.0" derive_more 
= { version = "2.0.1", default-features = false, features = [ "from", diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 8bb4dff867b..5a255f5cf7b 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -169,6 +169,8 @@ pub fn build_microvm_for_boot( let mut device_manager = DeviceManager::new(event_manager, &vcpus_exit_evt, &vm)?; + let vm = Arc::new(vm); + let entry_point = load_kernel(&boot_config.kernel_file, vm.guest_memory())?; let initrd = InitrdConfig::from_config(boot_config, vm.guest_memory())?; @@ -271,7 +273,7 @@ pub fn build_microvm_for_boot( instance_info: instance_info.clone(), shutdown_exit_code: None, kvm, - vm: Arc::new(vm), + vm, uffd: None, vcpus_handles: Vec::new(), vcpus_exit_evt, @@ -554,7 +556,7 @@ fn setup_pvtime( fn attach_entropy_device( device_manager: &mut DeviceManager, - vm: &Vm, + vm: &Arc, cmdline: &mut LoaderKernelCmdline, entropy_device: &Arc>, event_manager: &mut EventManager, @@ -571,7 +573,7 @@ fn attach_entropy_device( fn attach_block_devices<'a, I: Iterator>> + Debug>( device_manager: &mut DeviceManager, - vm: &Vm, + vm: &Arc, cmdline: &mut LoaderKernelCmdline, blocks: I, event_manager: &mut EventManager, @@ -600,7 +602,7 @@ fn attach_block_devices<'a, I: Iterator>> + Debug>( fn attach_net_devices<'a, I: Iterator>> + Debug>( device_manager: &mut DeviceManager, - vm: &Vm, + vm: &Arc, cmdline: &mut LoaderKernelCmdline, net_devices: I, event_manager: &mut EventManager, @@ -616,7 +618,7 @@ fn attach_net_devices<'a, I: Iterator>> + Debug>( fn attach_unixsock_vsock_device( device_manager: &mut DeviceManager, - vm: &Vm, + vm: &Arc, cmdline: &mut LoaderKernelCmdline, unix_vsock: &Arc>>, event_manager: &mut EventManager, @@ -629,7 +631,7 @@ fn attach_unixsock_vsock_device( fn attach_balloon_device( device_manager: &mut DeviceManager, - vm: &Vm, + vm: &Arc, cmdline: &mut LoaderKernelCmdline, balloon: &Arc>, event_manager: &mut EventManager, diff --git a/src/vmm/src/device_manager/mod.rs 
b/src/vmm/src/device_manager/mod.rs index 8df4da2863d..da61db922c3 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -79,6 +79,8 @@ pub enum AttachDeviceError { #[cfg(target_arch = "aarch64")] /// Error creating serial device: {0} CreateSerial(#[from] std::io::Error), + /// Error attach PCI device: {0} + PciTransport(#[from] PciManagerError), } #[derive(Debug)] @@ -160,8 +162,10 @@ impl DeviceManager { }) } - /// Attaches a VirtioDevice device to the device manager and event manager. - pub(crate) fn attach_virtio_device( + /// Attaches an MMIO VirtioDevice device to the device manager and event manager. + pub(crate) fn attach_mmio_virtio_device< + T: 'static + VirtioDevice + MutEventSubscriber + Debug, + >( &mut self, vm: &Vm, id: String, @@ -184,6 +188,25 @@ impl DeviceManager { Ok(()) } + /// Attaches a VirtioDevice device to the device manager and event manager. + pub(crate) fn attach_virtio_device( + &mut self, + vm: &Arc, + id: String, + device: Arc>, + cmdline: &mut Cmdline, + is_vhost_user: bool, + ) -> Result<(), AttachDeviceError> { + if self.pci_devices.pci_segment.is_some() { + self.pci_devices + .attach_pci_virtio_device(vm, &self.resource_allocator, id, device)?; + } else { + self.attach_mmio_virtio_device(vm, id, device, cmdline, is_vhost_user)?; + } + + Ok(()) + } + /// Attaches a [`BootTimer`] to the VM pub(crate) fn attach_boot_timer_device( &mut self, diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index e9ada60cc1f..686349858fb 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -1,18 +1,29 @@ // Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 -use std::sync::Arc; +use std::collections::HashMap; +use std::fmt::Debug; +use std::sync::{Arc, Mutex}; +use event_manager::MutEventSubscriber; +use log::debug; +use pci::{PciBarRegionType, PciDevice, PciDeviceError, PciRootError}; use serde::{Deserialize, Serialize}; use vm_device::BusError; -use super::resources::ResourceAllocator; +use crate::Vm; +use crate::device_manager::resources::ResourceAllocator; use crate::devices::pci::PciSegment; +use crate::devices::virtio::device::VirtioDevice; +use crate::devices::virtio::transport::pci::device::{VirtioPciDevice, VirtioPciDeviceError}; +use crate::vstate::vm::InterruptError; #[derive(Debug, Default)] pub struct PciDevices { /// PCIe segment of the VMM, if PCI is enabled. We currently support a single PCIe segment. pub pci_segment: Option, + /// All VirtIO PCI devices of the system + pub virtio_devices: HashMap<(u32, String), Arc>>, } #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -21,6 +32,16 @@ pub enum PciManagerError { ResourceAllocation(#[from] vm_allocator::Error), /// Bus error: {0} Bus(#[from] BusError), + /// PCI root error: {0} + PciRoot(#[from] PciRootError), + /// MSI error: {0} + Msi(#[from] InterruptError), + /// VirtIO PCI device error: {0} + VirtioPciDevice(#[from] VirtioPciDeviceError), + /// PCI device error: {0} + PciDeviceError(#[from] PciDeviceError), + /// KVM error: {0} + Kvm(#[from] vmm_sys_util::errno::Error), } impl PciDevices { @@ -61,6 +82,112 @@ impl PciDevices { Ok(()) } + + fn register_bars_with_bus( + resource_allocator: &ResourceAllocator, + virtio_device: &Arc>, + ) -> Result<(), PciManagerError> { + for bar in &virtio_device.lock().expect("Poisoned lock").bar_regions { + match bar.region_type() { + PciBarRegionType::IoRegion => { + debug!( + "Inserting I/O BAR region: {:#x}:{:#x}", + bar.addr(), + bar.size() + ); + #[cfg(target_arch = "x86_64")] + resource_allocator.pio_bus.insert( + virtio_device.clone(), + bar.addr(), + 
bar.size(), + )?; + #[cfg(target_arch = "aarch64")] + log::error!("pci: We do not support I/O region allocation") + } + PciBarRegionType::Memory32BitRegion | PciBarRegionType::Memory64BitRegion => { + debug!( + "Inserting MMIO BAR region: {:#x}:{:#x}", + bar.addr(), + bar.size() + ); + resource_allocator.mmio_bus.insert( + virtio_device.clone(), + bar.addr(), + bar.size(), + )?; + } + } + } + + Ok(()) + } + + pub(crate) fn attach_pci_virtio_device< + T: 'static + VirtioDevice + MutEventSubscriber + Debug, + >( + &mut self, + vm: &Arc, + resource_allocator: &ResourceAllocator, + id: String, + device: Arc>, + ) -> Result<(), PciManagerError> { + // We should only be reaching this point if PCI is enabled + let pci_segment = self.pci_segment.as_ref().unwrap(); + let pci_device_bdf = pci_segment.next_device_bdf()?; + debug!("Allocating BDF: {pci_device_bdf:?} for device"); + let mem = vm.guest_memory().clone(); + + // Allocate one MSI vector per queue, plus one for configuration + let msix_num = + u16::try_from(device.lock().expect("Poisoned lock").queues().len() + 1).unwrap(); + + let msix_vectors = Arc::new(Vm::create_msix_group( + vm.clone(), + resource_allocator, + msix_num, + )?); + + // Create the transport + let mut virtio_device = + VirtioPciDevice::new(id.clone(), mem, device, msix_vectors, pci_device_bdf.into())?; + + // Allocate bars + let mut mmio32_allocator = resource_allocator + .mmio32_memory + .lock() + .expect("Poisoned lock"); + let mut mmio64_allocator = resource_allocator + .mmio64_memory + .lock() + .expect("Poisoned lock"); + + virtio_device.allocate_bars(&mut mmio32_allocator, &mut mmio64_allocator, None)?; + + let virtio_device = Arc::new(Mutex::new(virtio_device)); + pci_segment + .pci_bus + .lock() + .expect("Poisoned lock") + .add_device(pci_device_bdf.device() as u32, virtio_device.clone())?; + + Self::register_bars_with_bus(resource_allocator, &virtio_device)?; + virtio_device + .lock() + .expect("Poisoned lock") + 
.register_notification_ioevent(vm)?; + + Ok(()) + } + + /// Gets the specified device. + pub fn get_virtio_device( + &self, + device_type: u32, + device_id: &str, + ) -> Option<&Arc>> { + self.virtio_devices + .get(&(device_type, device_id.to_string())) + } } #[derive(Default, Debug, Clone, Serialize, Deserialize)] diff --git a/src/vmm/src/devices/virtio/device.rs b/src/vmm/src/devices/virtio/device.rs index 0b09195d8f7..7b51a4b1dbf 100644 --- a/src/vmm/src/devices/virtio/device.rs +++ b/src/vmm/src/devices/virtio/device.rs @@ -148,7 +148,7 @@ pub trait VirtioDevice: AsAny + Send { /// Optionally deactivates this device and returns ownership of the guest memory map, interrupt /// event, and queue events. - fn reset(&mut self) -> Option<(EventFd, Vec)> { + fn reset(&mut self) -> Option<(Arc, Vec)> { None } diff --git a/src/vmm/src/devices/virtio/queue.rs b/src/vmm/src/devices/virtio/queue.rs index 79c635e5c4d..7fd862f45ca 100644 --- a/src/vmm/src/devices/virtio/queue.rs +++ b/src/vmm/src/devices/virtio/queue.rs @@ -669,6 +669,19 @@ impl Queue { new - used_event - Wrapping(1) < new - old } + + /// Resets the Virtio Queue + pub(crate) fn reset(&mut self) { + self.ready = false; + self.size = self.max_size; + self.desc_table_address = GuestAddress(0); + self.avail_ring_address = GuestAddress(0); + self.used_ring_address = GuestAddress(0); + self.next_avail = Wrapping(0); + self.next_used = Wrapping(0); + self.num_added = Wrapping(0); + self.uses_notif_suppression = false; + } } #[cfg(kani)] diff --git a/src/vmm/src/devices/virtio/transport/mod.rs b/src/vmm/src/devices/virtio/transport/mod.rs index d41ad943aa2..c16a7adbe9d 100644 --- a/src/vmm/src/devices/virtio/transport/mod.rs +++ b/src/vmm/src/devices/virtio/transport/mod.rs @@ -8,6 +8,8 @@ use vmm_sys_util::eventfd::EventFd; /// MMIO transport for VirtIO devices pub mod mmio; +/// PCI transport for VirtIO devices +pub mod pci; /// Represents the types of interrupts used by VirtIO devices #[derive(Debug, Clone)] diff 
--git a/src/vmm/src/devices/virtio/transport/pci/common_config.rs b/src/vmm/src/devices/virtio/transport/pci/common_config.rs new file mode 100644 index 00000000000..c8ee2d1d2a9 --- /dev/null +++ b/src/vmm/src/devices/virtio/transport/pci/common_config.rs @@ -0,0 +1,415 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 The Chromium OS Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::sync::atomic::{AtomicU16, Ordering}; +use std::sync::{Arc, Mutex}; + +use byteorder::{ByteOrder, LittleEndian}; +use serde::{Deserialize, Serialize}; +use vm_memory::GuestAddress; + +use crate::devices::virtio::device::VirtioDevice; +use crate::devices::virtio::queue::Queue; +use crate::logger::{debug, error, info, trace, warn}; +pub const VIRTIO_PCI_COMMON_CONFIG_ID: &str = "virtio_pci_common_config"; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VirtioPciCommonConfigState { + pub driver_status: u8, + pub config_generation: u8, + pub device_feature_select: u32, + pub driver_feature_select: u32, + pub queue_select: u16, + pub msix_config: u16, + pub msix_queues: Vec, +} + +// The standard layout for the ring is a continuous chunk of memory which looks +// like this. We assume num is a power of 2. +// +// struct vring +// { +// // The actual descriptors (16 bytes each) +// struct vring_desc desc[num]; +// +// // A ring of available descriptor heads with free-running index. +// __virtio16 avail_flags; +// __virtio16 avail_idx; +// __virtio16 available[num]; +// __virtio16 used_event_idx; +// +// // Padding to the next align boundary. +// char pad[]; +// +// // A ring of used descriptor heads with free-running index. 
+// __virtio16 used_flags; +// __virtio16 used_idx; +// struct vring_used_elem used[num]; +// __virtio16 avail_event_idx; +// }; +// struct vring_desc { +// __virtio64 addr; +// __virtio32 len; +// __virtio16 flags; +// __virtio16 next; +// }; +// +// struct vring_avail { +// __virtio16 flags; +// __virtio16 idx; +// __virtio16 ring[]; +// }; +// +// // u32 is used here for ids for padding reasons. +// struct vring_used_elem { +// // Index of start of used descriptor chain. +// __virtio32 id; +// // Total length of the descriptor chain which was used (written to) +// __virtio32 len; +// }; +// +// Kernel header used for this reference: include/uapi/linux/virtio_ring.h +// Virtio Spec: https://docs.oasis-open.org/virtio/virtio/v1.2/csd01/virtio-v1.2-csd01.html +// +const VRING_DESC_ELEMENT_SIZE: usize = 16; +const VRING_AVAIL_ELEMENT_SIZE: usize = 2; +const VRING_USED_ELEMENT_SIZE: usize = 8; +#[derive(Debug)] +pub enum VringType { + Desc, + Avail, + Used, +} + +pub fn get_vring_size(t: VringType, queue_size: u16) -> u64 { + let (length_except_ring, element_size) = match t { + VringType::Desc => (0, VRING_DESC_ELEMENT_SIZE), + VringType::Avail => (6, VRING_AVAIL_ELEMENT_SIZE), + VringType::Used => (6, VRING_USED_ELEMENT_SIZE), + }; + (length_except_ring + element_size * queue_size as usize) as u64 +} + +/// Contains the data for reading and writing the common configuration structure of a virtio PCI +/// device. +/// +/// * Registers: +/// +/// ** About the whole device. +/// le32 device_feature_select; // 0x00 // read-write +/// le32 device_feature; // 0x04 // read-only for driver +/// le32 driver_feature_select; // 0x08 // read-write +/// le32 driver_feature; // 0x0C // read-write +/// le16 msix_config; // 0x10 // read-write +/// le16 num_queues; // 0x12 // read-only for driver +/// u8 device_status; // 0x14 // read-write (driver_status) +/// u8 config_generation; // 0x15 // read-only for driver +/// +/// ** About a specific virtqueue. 
+/// le16 queue_select; // 0x16 // read-write +/// le16 queue_size; // 0x18 // read-write, power of 2, or 0. +/// le16 queue_msix_vector; // 0x1A // read-write +/// le16 queue_enable; // 0x1C // read-write (Ready) +/// le16 queue_notify_off; // 0x1E // read-only for driver +/// le64 queue_desc; // 0x20 // read-write +/// le64 queue_avail; // 0x28 // read-write +/// le64 queue_used; // 0x30 // read-write +#[derive(Debug)] +pub struct VirtioPciCommonConfig { + pub driver_status: u8, + pub config_generation: u8, + pub device_feature_select: u32, + pub driver_feature_select: u32, + pub queue_select: u16, + pub msix_config: Arc, + pub msix_queues: Arc>>, +} + +impl VirtioPciCommonConfig { + pub fn new(state: VirtioPciCommonConfigState) -> Self { + VirtioPciCommonConfig { + driver_status: state.driver_status, + config_generation: state.config_generation, + device_feature_select: state.device_feature_select, + driver_feature_select: state.driver_feature_select, + queue_select: state.queue_select, + msix_config: Arc::new(AtomicU16::new(state.msix_config)), + msix_queues: Arc::new(Mutex::new(state.msix_queues)), + } + } + + fn state(&self) -> VirtioPciCommonConfigState { + VirtioPciCommonConfigState { + driver_status: self.driver_status, + config_generation: self.config_generation, + device_feature_select: self.device_feature_select, + driver_feature_select: self.driver_feature_select, + queue_select: self.queue_select, + msix_config: self.msix_config.load(Ordering::Acquire), + msix_queues: self.msix_queues.lock().unwrap().clone(), + } + } + + pub fn read(&mut self, offset: u64, data: &mut [u8], device: Arc>) { + assert!(data.len() <= 8); + + match data.len() { + 1 => { + let v = self.read_common_config_byte(offset); + data[0] = v; + } + 2 => { + let v = self.read_common_config_word(offset, device.lock().unwrap().queues()); + LittleEndian::write_u16(data, v); + } + 4 => { + let v = self.read_common_config_dword(offset, device); + LittleEndian::write_u32(data, v); + } + 8 => 
{ + let v = self.read_common_config_qword(offset); + LittleEndian::write_u64(data, v); + } + _ => error!("invalid data length for virtio read: len {}", data.len()), + } + } + + pub fn write(&mut self, offset: u64, data: &[u8], device: Arc>) { + assert!(data.len() <= 8); + + match data.len() { + 1 => self.write_common_config_byte(offset, data[0]), + 2 => self.write_common_config_word( + offset, + LittleEndian::read_u16(data), + device.lock().unwrap().queues_mut(), + ), + 4 => self.write_common_config_dword(offset, LittleEndian::read_u32(data), device), + 8 => self.write_common_config_qword( + offset, + LittleEndian::read_u64(data), + device.lock().unwrap().queues_mut(), + ), + _ => error!("invalid data length for virtio write: len {}", data.len()), + } + } + + fn read_common_config_byte(&self, offset: u64) -> u8 { + debug!("read_common_config_byte: offset 0x{:x}", offset); + // The driver is only allowed to do aligned, properly sized access. + match offset { + 0x14 => self.driver_status, + 0x15 => self.config_generation, + _ => { + warn!("invalid virtio config byte read: 0x{:x}", offset); + 0 + } + } + } + + fn write_common_config_byte(&mut self, offset: u64, value: u8) { + debug!("write_common_config_byte: offset 0x{offset:x}: {value:x}"); + match offset { + 0x14 => self.driver_status = value, + _ => { + warn!("invalid virtio config byte write: 0x{:x}", offset); + } + } + } + + fn read_common_config_word(&self, offset: u64, queues: &[Queue]) -> u16 { + debug!("read_common_config_word: offset 0x{:x}", offset); + match offset { + 0x10 => self.msix_config.load(Ordering::Acquire), + 0x12 => queues.len().try_into().unwrap(), // num_queues + 0x16 => self.queue_select, + 0x18 => self.with_queue(queues, |q| q.size).unwrap_or(0), + 0x1a => self.msix_queues.lock().unwrap()[self.queue_select as usize], + 0x1c => u16::from(self.with_queue(queues, |q| q.ready).unwrap_or(false)), + 0x1e => self.queue_select, // notify_off + _ => { + warn!("invalid virtio register word read: 
0x{:x}", offset); + 0 + } + } + } + + fn write_common_config_word(&mut self, offset: u64, value: u16, queues: &mut [Queue]) { + debug!("write_common_config_word: offset 0x{:x}", offset); + match offset { + 0x10 => self.msix_config.store(value, Ordering::Release), + 0x16 => self.queue_select = value, + 0x18 => self.with_queue_mut(queues, |q| q.size = value), + 0x1a => self.msix_queues.lock().unwrap()[self.queue_select as usize] = value, + 0x1c => self.with_queue_mut(queues, |q| { + q.ready = value == 1; + }), + _ => { + warn!("invalid virtio register word write: 0x{:x}", offset); + } + } + } + + fn read_common_config_dword(&self, offset: u64, device: Arc>) -> u32 { + debug!("read_common_config_dword: offset 0x{:x}", offset); + match offset { + 0x00 => self.device_feature_select, + 0x04 => { + let locked_device = device.lock().unwrap(); + // Only 64 bits of features (2 pages) are defined for now, so limit + // device_feature_select to avoid shifting by 64 or more bits. + if self.device_feature_select < 2 { + ((locked_device.avail_features() >> (self.device_feature_select * 32)) + & 0xffff_ffff) as u32 + } else { + 0 + } + } + 0x08 => self.driver_feature_select, + _ => { + warn!("invalid virtio register dword read: 0x{:x}", offset); + 0 + } + } + } + + fn write_common_config_dword( + &mut self, + offset: u64, + value: u32, + device: Arc>, + ) { + debug!("write_common_config_dword: offset 0x{:x}", offset); + fn hi(v: &mut GuestAddress, x: u32) { + *v = (*v & 0xffff_ffff) | (u64::from(x) << 32) + } + + fn lo(v: &mut GuestAddress, x: u32) { + *v = (*v & !0xffff_ffff) | u64::from(x) + } + + let mut locked_device = device.lock().unwrap(); + + match offset { + 0x00 => self.device_feature_select = value, + 0x08 => self.driver_feature_select = value, + 0x0c => locked_device.ack_features_by_page(self.driver_feature_select, value), + 0x20 => self.with_queue_mut(locked_device.queues_mut(), |q| { + lo(&mut q.desc_table_address, value) + }), + 0x24 => 
self.with_queue_mut(locked_device.queues_mut(), |q| { + hi(&mut q.desc_table_address, value) + }), + 0x28 => self.with_queue_mut(locked_device.queues_mut(), |q| { + lo(&mut q.avail_ring_address, value) + }), + 0x2c => self.with_queue_mut(locked_device.queues_mut(), |q| { + hi(&mut q.avail_ring_address, value) + }), + 0x30 => self.with_queue_mut(locked_device.queues_mut(), |q| { + lo(&mut q.used_ring_address, value) + }), + 0x34 => self.with_queue_mut(locked_device.queues_mut(), |q| { + hi(&mut q.used_ring_address, value) + }), + _ => { + warn!("invalid virtio register dword write: 0x{:x}", offset); + } + } + } + + fn read_common_config_qword(&self, _offset: u64) -> u64 { + debug!("read_common_config_qword: offset 0x{:x}", _offset); + 0 // Assume the guest has no reason to read write-only registers. + } + + fn write_common_config_qword(&mut self, offset: u64, value: u64, queues: &mut [Queue]) { + debug!("write_common_config_qword: offset 0x{:x}", offset); + + let low = Some((value & 0xffff_ffff) as u32); + let high = Some((value >> 32) as u32); + + match offset { + 0x20 => self.with_queue_mut(queues, |q| q.desc_table_address.0 = value), + 0x28 => self.with_queue_mut(queues, |q| q.avail_ring_address.0 = value), + 0x30 => self.with_queue_mut(queues, |q| q.used_ring_address.0 = value), + _ => { + warn!("invalid virtio register qword write: 0x{:x}", offset); + } + } + } + + fn with_queue(&self, queues: &[Queue], f: F) -> Option + where + F: FnOnce(&Queue) -> U, + { + queues.get(self.queue_select as usize).map(f) + } + + fn with_queue_mut(&self, queues: &mut [Queue], f: F) { + if let Some(queue) = queues.get_mut(self.queue_select as usize) { + f(queue); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::devices::virtio::transport::mmio::tests::DummyDevice; + + #[test] + fn write_base_regs() { + let mut regs = VirtioPciCommonConfig { + driver_status: 0xaa, + config_generation: 0x55, + device_feature_select: 0x0, + driver_feature_select: 0x0, + 
queue_select: 0xff, + msix_config: Arc::new(AtomicU16::new(0)), + msix_queues: Arc::new(Mutex::new(vec![0; 3])), + }; + + let dev = Arc::new(Mutex::new(DummyDevice::new())); + // Can set all bits of driver_status. + regs.write(0x14, &[0x55], dev.clone()); + let mut read_back = vec![0x00]; + regs.read(0x14, &mut read_back, dev.clone()); + assert_eq!(read_back[0], 0x55); + + // The config generation register is read only. + regs.write(0x15, &[0xaa], dev.clone()); + let mut read_back = vec![0x00]; + regs.read(0x15, &mut read_back, dev.clone()); + assert_eq!(read_back[0], 0x55); + + // Device features is read-only and passed through from the device. + regs.write(0x04, &[0, 0, 0, 0], dev.clone()); + let mut read_back = vec![0, 0, 0, 0]; + regs.read(0x04, &mut read_back, dev.clone()); + assert_eq!(LittleEndian::read_u32(&read_back), 0u32); + + // Feature select registers are read/write. + regs.write(0x00, &[1, 2, 3, 4], dev.clone()); + let mut read_back = vec![0, 0, 0, 0]; + regs.read(0x00, &mut read_back, dev.clone()); + assert_eq!(LittleEndian::read_u32(&read_back), 0x0403_0201); + regs.write(0x08, &[1, 2, 3, 4], dev.clone()); + let mut read_back = vec![0, 0, 0, 0]; + regs.read(0x08, &mut read_back, dev.clone()); + assert_eq!(LittleEndian::read_u32(&read_back), 0x0403_0201); + + // 'queue_select' can be read and written. + regs.write(0x16, &[0xaa, 0x55], dev.clone()); + let mut read_back = vec![0x00, 0x00]; + regs.read(0x16, &mut read_back, dev); + assert_eq!(read_back[0], 0xaa); + assert_eq!(read_back[1], 0x55); + } +} diff --git a/src/vmm/src/devices/virtio/transport/pci/device.rs b/src/vmm/src/devices/virtio/transport/pci/device.rs new file mode 100644 index 00000000000..20c169297fd --- /dev/null +++ b/src/vmm/src/devices/virtio/transport/pci/device.rs @@ -0,0 +1,1279 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// Copyright 2018 The Chromium OS Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause + +use std::any::Any; +use std::cmp; +use std::fmt::{Debug, Formatter}; +use std::io::Write; +use std::sync::atomic::{AtomicBool, AtomicU16, AtomicU32, AtomicUsize, Ordering}; +use std::sync::{Arc, Barrier, Mutex}; + +use anyhow::anyhow; +use kvm_ioctls::{IoEventAddress, NoDatamatch}; +use pci::{ + BarReprogrammingParams, MsixCap, MsixConfig, MsixConfigState, PciBarConfiguration, + PciBarRegionType, PciBdf, PciCapability, PciCapabilityId, PciClassCode, PciConfiguration, + PciConfigurationState, PciDevice, PciDeviceError, PciHeaderType, PciMassStorageSubclass, + PciNetworkControllerSubclass, PciSubclass, +}; +use serde::{Deserialize, Serialize}; +use thiserror::Error; +use vm_allocator::{AddressAllocator, AllocPolicy, RangeInclusive}; +use vm_device::interrupt::{InterruptIndex, InterruptSourceGroup, MsiIrqGroupConfig}; +use vm_device::{BusDevice, PciBarType, Resource}; +use vm_memory::{Address, ByteValued, GuestAddress, Le32}; +use vmm_sys_util::errno; +use vmm_sys_util::eventfd::EventFd; + +use crate::Vm; +use crate::device_manager::resources::ResourceAllocator; +use crate::devices::virtio::device::VirtioDevice; +use crate::devices::virtio::queue::Queue; +use crate::devices::virtio::transport::pci::common_config::{ + VirtioPciCommonConfig, VirtioPciCommonConfigState, +}; +use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; +use crate::devices::virtio::{TYPE_BLOCK, TYPE_NET}; +use crate::logger::{debug, error}; +use crate::utils::u64_to_usize; +use crate::vstate::memory::GuestMemoryMmap; +use crate::vstate::vm::{InterruptError, MsiVectorGroup}; + +const DEVICE_INIT: u8 = 0x00; +const DEVICE_ACKNOWLEDGE: u8 = 0x01; +const DEVICE_DRIVER: u8 = 0x02; +const DEVICE_DRIVER_OK: u8 = 0x04; +const DEVICE_FEATURES_OK: u8 = 0x08; 
+const DEVICE_FAILED: u8 = 0x80; + +const VIRTIO_F_RING_INDIRECT_DESC: u32 = 28; +const VIRTIO_F_RING_EVENT_IDX: u32 = 29; +const VIRTIO_F_VERSION_1: u32 = 32; +const VIRTIO_F_IOMMU_PLATFORM: u32 = 33; +const VIRTIO_F_IN_ORDER: u32 = 35; +const VIRTIO_F_ORDER_PLATFORM: u32 = 36; +#[allow(dead_code)] +const VIRTIO_F_SR_IOV: u32 = 37; +const VIRTIO_F_NOTIFICATION_DATA: u32 = 38; + +/// Vector value used to disable MSI for a queue. +const VIRTQ_MSI_NO_VECTOR: u16 = 0xffff; + +enum PciCapabilityType { + Common = 1, + Notify = 2, + Isr = 3, + Device = 4, + Pci = 5, + SharedMemory = 8, +} + +// This offset represents the 2 bytes omitted from the VirtioPciCap structure +// as they are already handled through add_capability(). These 2 bytes are the +// fields cap_vndr (1 byte) and cap_next (1 byte) defined in the virtio spec. +const VIRTIO_PCI_CAP_OFFSET: usize = 2; + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Debug, Clone, Copy, Default)] +struct VirtioPciCap { + cap_len: u8, // Generic PCI field: capability length + cfg_type: u8, // Identifies the structure. + pci_bar: u8, // Where to find it. + id: u8, // Multiple capabilities of the same type + padding: [u8; 2], // Pad to full dword. + offset: Le32, // Offset within bar. + length: Le32, // Length of the structure, in bytes. +} + +// SAFETY: All members are simple numbers and any value is valid. 
+unsafe impl ByteValued for VirtioPciCap {} + +impl PciCapability for VirtioPciCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +const VIRTIO_PCI_CAP_LEN_OFFSET: u8 = 2; + +impl VirtioPciCap { + pub fn new(cfg_type: PciCapabilityType, pci_bar: u8, offset: u32, length: u32) -> Self { + VirtioPciCap { + cap_len: u8::try_from(std::mem::size_of::()).unwrap() + + VIRTIO_PCI_CAP_LEN_OFFSET, + cfg_type: cfg_type as u8, + pci_bar, + id: 0, + padding: [0; 2], + offset: Le32::from(offset), + length: Le32::from(length), + } + } +} + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Clone, Copy, Default)] +struct VirtioPciNotifyCap { + cap: VirtioPciCap, + notify_off_multiplier: Le32, +} +// SAFETY: All members are simple numbers and any value is valid. +unsafe impl ByteValued for VirtioPciNotifyCap {} + +impl PciCapability for VirtioPciNotifyCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +impl VirtioPciNotifyCap { + pub fn new( + cfg_type: PciCapabilityType, + pci_bar: u8, + offset: u32, + length: u32, + multiplier: Le32, + ) -> Self { + VirtioPciNotifyCap { + cap: VirtioPciCap { + cap_len: u8::try_from(std::mem::size_of::()).unwrap() + + VIRTIO_PCI_CAP_LEN_OFFSET, + cfg_type: cfg_type as u8, + pci_bar, + id: 0, + padding: [0; 2], + offset: Le32::from(offset), + length: Le32::from(length), + }, + notify_off_multiplier: multiplier, + } + } +} + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Clone, Copy, Default)] +struct VirtioPciCap64 { + cap: VirtioPciCap, + offset_hi: Le32, + length_hi: Le32, +} +// SAFETY: All members are simple numbers and any value is valid. 
+unsafe impl ByteValued for VirtioPciCap64 {} + +impl PciCapability for VirtioPciCap64 { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +impl VirtioPciCap64 { + pub fn new(cfg_type: PciCapabilityType, pci_bar: u8, id: u8, offset: u64, length: u64) -> Self { + VirtioPciCap64 { + cap: VirtioPciCap { + cap_len: u8::try_from(std::mem::size_of::()).unwrap() + + VIRTIO_PCI_CAP_LEN_OFFSET, + cfg_type: cfg_type as u8, + pci_bar, + id, + padding: [0; 2], + offset: Le32::from((offset & 0xffff_ffff) as u32), + length: Le32::from((length & 0xffff_ffff) as u32), + }, + offset_hi: Le32::from((offset >> 32) as u32), + length_hi: Le32::from((length >> 32) as u32), + } + } +} + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Debug, Clone, Copy, Default)] +struct VirtioPciCfgCap { + cap: VirtioPciCap, + pci_cfg_data: [u8; 4], +} +// SAFETY: All members are simple numbers and any value is valid. +unsafe impl ByteValued for VirtioPciCfgCap {} + +impl PciCapability for VirtioPciCfgCap { + fn bytes(&self) -> &[u8] { + self.as_slice() + } + + fn id(&self) -> PciCapabilityId { + PciCapabilityId::VendorSpecific + } +} + +impl VirtioPciCfgCap { + fn new() -> Self { + VirtioPciCfgCap { + cap: VirtioPciCap::new(PciCapabilityType::Pci, 0, 0, 0), + ..Default::default() + } + } +} + +#[derive(Debug, Clone, Copy, Default)] +struct VirtioPciCfgCapInfo { + offset: usize, + cap: VirtioPciCfgCap, +} + +#[allow(dead_code)] +#[derive(Debug, Copy, Clone)] +pub enum PciVirtioSubclass { + NonTransitionalBase = 0xff, +} + +impl PciSubclass for PciVirtioSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +// Allocate one bar for the structs pointed to by the capability structures. +// As per the PCI specification, because the same BAR shares MSI-X and non +// MSI-X structures, it is recommended to use 8KiB alignment for all those +// structures. 
+const COMMON_CONFIG_BAR_OFFSET: u64 = 0x0000; +const COMMON_CONFIG_SIZE: u64 = 56; +const ISR_CONFIG_BAR_OFFSET: u64 = 0x2000; +const ISR_CONFIG_SIZE: u64 = 1; +const DEVICE_CONFIG_BAR_OFFSET: u64 = 0x4000; +const DEVICE_CONFIG_SIZE: u64 = 0x1000; +const NOTIFICATION_BAR_OFFSET: u64 = 0x6000; +const NOTIFICATION_SIZE: u64 = 0x1000; +const MSIX_TABLE_BAR_OFFSET: u64 = 0x8000; +// The size is 256KiB because the table can hold up to 2048 entries, with each +// entry being 128 bits (4 DWORDS). +const MSIX_TABLE_SIZE: u64 = 0x40000; +const MSIX_PBA_BAR_OFFSET: u64 = 0x48000; +// The size is 2KiB because the Pending Bit Array has one bit per vector and it +// can support up to 2048 vectors. +const MSIX_PBA_SIZE: u64 = 0x800; +// The BAR size must be a power of 2. +const CAPABILITY_BAR_SIZE: u64 = 0x80000; +const VIRTIO_COMMON_BAR_INDEX: usize = 0; +const VIRTIO_SHM_BAR_INDEX: usize = 2; + +const NOTIFY_OFF_MULTIPLIER: u32 = 4; // A dword per notification address. + +const VIRTIO_PCI_VENDOR_ID: u16 = 0x1af4; +const VIRTIO_PCI_DEVICE_ID_BASE: u16 = 0x1040; // Add to device type to get device ID. + +#[derive(Debug, Serialize, Deserialize)] +struct QueueState { + max_size: u16, + size: u16, + ready: bool, + desc_table: u64, + avail_ring: u64, + used_ring: u64, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct VirtioPciDeviceState { + pub pci_device_bdf: PciBdf, + device_activated: bool, + queues: Vec, + interrupt_status: usize, + cap_pci_cfg_offset: usize, + cap_pci_cfg: Vec, +} + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum VirtioPciDeviceError { + /// Failed creating VirtioPciDevice: {0} + CreateVirtioPciDevice(#[from] anyhow::Error), + /// Error creating MSI configuration: {0} + Msi(#[from] pci::MsixError), +} +pub type Result = std::result::Result; + +pub struct VirtioPciDevice { + id: String, + + // BDF assigned to the device + pci_device_bdf: PciBdf, + + // PCI configuration registers. 
+ configuration: PciConfiguration, + + // virtio PCI common configuration + common_config: VirtioPciCommonConfig, + + // MSI-X config + msix_config: Option>>, + + // Number of MSI-X vectors + msix_num: u16, + + // Virtio device reference and status + device: Arc>, + device_activated: Arc, + + // PCI interrupts. + interrupt_status: Arc, + virtio_interrupt: Option>, + interrupt_source_group: Arc, + + // Guest memory + memory: GuestMemoryMmap, + + // Settings PCI BAR + settings_bar: u8, + + // Whether to use 64-bit bar location or 32-bit + use_64bit_bar: bool, + + // Add a dedicated structure to hold information about the very specific + // virtio-pci capability VIRTIO_PCI_CAP_PCI_CFG. This is needed to support + // the legacy/backward compatible mechanism of letting the guest access the + // other virtio capabilities without mapping the PCI BARs. This can be + // needed when the guest tries to early access the virtio configuration of + // a device. + cap_pci_cfg_info: VirtioPciCfgCapInfo, + + // Details of bar regions to free + pub bar_regions: Vec, +} + +impl Debug for VirtioPciDevice { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + f.debug_struct("VirtioPciDevice") + .field("id", &self.id) + .finish() + } +} + +impl VirtioPciDevice { + fn pci_configuration( + virtio_device_type: u32, + msix_config: &Arc>, + pci_config_state: Option, + ) -> PciConfiguration { + let pci_device_id = VIRTIO_PCI_DEVICE_ID_BASE + u16::try_from(virtio_device_type).unwrap(); + let (class, subclass) = match virtio_device_type { + TYPE_NET => ( + PciClassCode::NetworkController, + &PciNetworkControllerSubclass::EthernetController as &dyn PciSubclass, + ), + TYPE_BLOCK => ( + PciClassCode::MassStorage, + &PciMassStorageSubclass::MassStorage as &dyn PciSubclass, + ), + _ => ( + PciClassCode::Other, + &PciVirtioSubclass::NonTransitionalBase as &dyn PciSubclass, + ), + }; + + PciConfiguration::new( + VIRTIO_PCI_VENDOR_ID, + pci_device_id, + 0x1, // For modern virtio-PCI devices + 
class, + subclass, + None, + PciHeaderType::Device, + VIRTIO_PCI_VENDOR_ID, + pci_device_id, + Some(msix_config.clone()), + pci_config_state, + ) + } + + fn msix_config( + pci_device_bdf: u32, + msix_vectors: Arc, + msix_config_state: Option, + ) -> Result>> { + let msix_config = Arc::new(Mutex::new(MsixConfig::new( + msix_vectors.num_vectors(), + msix_vectors, + pci_device_bdf, + msix_config_state, + )?)); + + Ok(msix_config) + } + + /// Constructs a new PCI transport for the given virtio device. + #[allow(clippy::too_many_arguments)] + pub fn new( + id: String, + memory: GuestMemoryMmap, + device: Arc>, + msi_vectors: Arc, + pci_device_bdf: u32, + ) -> Result { + let num_queues = device.lock().expect("Poisoned lock").queues().len(); + + let msix_config = Self::msix_config(pci_device_bdf, msi_vectors.clone(), None)?; + let pci_config = Self::pci_configuration( + device.lock().expect("Poisoned lock").device_type(), + &msix_config, + None, + ); + + let virtio_common_config = VirtioPciCommonConfig::new(VirtioPciCommonConfigState { + driver_status: 0, + config_generation: 0, + device_feature_select: 0, + driver_feature_select: 0, + queue_select: 0, + msix_config: VIRTQ_MSI_NO_VECTOR, + msix_queues: vec![VIRTQ_MSI_NO_VECTOR; num_queues], + }); + let interrupt = Arc::new(VirtioInterruptMsix::new( + msix_config.clone(), + virtio_common_config.msix_config.clone(), + virtio_common_config.msix_queues.clone(), + msi_vectors.clone(), + )); + + let virtio_pci_device = VirtioPciDevice { + id, + pci_device_bdf: pci_device_bdf.into(), + configuration: pci_config, + common_config: virtio_common_config, + msix_config: Some(msix_config), + msix_num: msi_vectors.num_vectors(), + device, + device_activated: Arc::new(AtomicBool::new(false)), + interrupt_status: Arc::new(AtomicUsize::new(0)), + virtio_interrupt: None, + memory, + settings_bar: 0, + use_64bit_bar: true, + interrupt_source_group: msi_vectors, + cap_pci_cfg_info: VirtioPciCfgCapInfo::default(), + bar_regions: vec![], + }; 
+ + Ok(virtio_pci_device) + } + + fn is_driver_ready(&self) -> bool { + let ready_bits = + (DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_DRIVER_OK | DEVICE_FEATURES_OK); + self.common_config.driver_status == ready_bits + && self.common_config.driver_status & DEVICE_FAILED == 0 + } + + /// Determines if the driver has requested the device (re)init / reset itself + fn is_driver_init(&self) -> bool { + self.common_config.driver_status == DEVICE_INIT + } + + pub fn config_bar_addr(&self) -> u64 { + self.configuration.get_bar_addr(self.settings_bar as usize) + } + + fn add_pci_capabilities( + &mut self, + settings_bar: u8, + ) -> std::result::Result<(), PciDeviceError> { + // Add pointers to the different configuration structures from the PCI capabilities. + let common_cap = VirtioPciCap::new( + PciCapabilityType::Common, + settings_bar, + COMMON_CONFIG_BAR_OFFSET.try_into().unwrap(), + COMMON_CONFIG_SIZE.try_into().unwrap(), + ); + self.configuration + .add_capability(&common_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + let isr_cap = VirtioPciCap::new( + PciCapabilityType::Isr, + settings_bar, + ISR_CONFIG_BAR_OFFSET.try_into().unwrap(), + ISR_CONFIG_SIZE.try_into().unwrap(), + ); + self.configuration + .add_capability(&isr_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + // TODO(dgreid) - set based on device's configuration size? 
+ let device_cap = VirtioPciCap::new( + PciCapabilityType::Device, + settings_bar, + DEVICE_CONFIG_BAR_OFFSET.try_into().unwrap(), + DEVICE_CONFIG_SIZE.try_into().unwrap(), + ); + self.configuration + .add_capability(&device_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + let notify_cap = VirtioPciNotifyCap::new( + PciCapabilityType::Notify, + settings_bar, + NOTIFICATION_BAR_OFFSET.try_into().unwrap(), + NOTIFICATION_SIZE.try_into().unwrap(), + Le32::from(NOTIFY_OFF_MULTIPLIER), + ); + self.configuration + .add_capability(¬ify_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + + let configuration_cap = VirtioPciCfgCap::new(); + self.cap_pci_cfg_info.offset = self + .configuration + .add_capability(&configuration_cap) + .map_err(PciDeviceError::CapabilitiesSetup)? + + VIRTIO_PCI_CAP_OFFSET; + self.cap_pci_cfg_info.cap = configuration_cap; + + if self.msix_config.is_some() { + let msix_cap = MsixCap::new( + settings_bar, + self.msix_num, + MSIX_TABLE_BAR_OFFSET.try_into().unwrap(), + settings_bar, + MSIX_PBA_BAR_OFFSET.try_into().unwrap(), + ); + self.configuration + .add_capability(&msix_cap) + .map_err(PciDeviceError::CapabilitiesSetup)?; + } + + self.settings_bar = settings_bar; + Ok(()) + } + + fn read_cap_pci_cfg(&mut self, offset: usize, mut data: &mut [u8]) { + let cap_slice = self.cap_pci_cfg_info.cap.as_slice(); + let data_len = data.len(); + let cap_len = cap_slice.len(); + if offset + data_len > cap_len { + error!("Failed to read cap_pci_cfg from config space"); + return; + } + + if offset < std::mem::size_of::() { + if let Some(end) = offset.checked_add(data_len) { + // This write can't fail, offset and end are checked against config_len. + data.write_all(&cap_slice[offset..cmp::min(end, cap_len)]) + .unwrap(); + } + } else { + let bar_offset: u32 = + // SAFETY: we know self.cap_pci_cfg_info.cap.cap.offset is 32bits long. 
+ unsafe { std::mem::transmute(self.cap_pci_cfg_info.cap.cap.offset) }; + self.read_bar(0, bar_offset as u64, data) + } + } + + fn write_cap_pci_cfg(&mut self, offset: usize, data: &[u8]) -> Option> { + let cap_slice = self.cap_pci_cfg_info.cap.as_mut_slice(); + let data_len = data.len(); + let cap_len = cap_slice.len(); + if offset + data_len > cap_len { + error!("Failed to write cap_pci_cfg to config space"); + return None; + } + + if offset < std::mem::size_of::() { + let (_, right) = cap_slice.split_at_mut(offset); + right[..data_len].copy_from_slice(data); + None + } else { + let bar_offset: u32 = + // SAFETY: we know self.cap_pci_cfg_info.cap.cap.offset is 32bits long. + unsafe { std::mem::transmute(self.cap_pci_cfg_info.cap.cap.offset) }; + self.write_bar(0, bar_offset as u64, data) + } + } + + pub fn virtio_device(&self) -> Arc> { + self.device.clone() + } + + fn needs_activation(&self) -> bool { + !self.device_activated.load(Ordering::SeqCst) && self.is_driver_ready() + } + + /// Register the IoEvent notification for a VirtIO device + pub fn register_notification_ioevent(&self, vm: &Vm) -> std::result::Result<(), errno::Error> { + let bar_addr = self.config_bar_addr(); + for (i, queue_evt) in self + .device + .lock() + .expect("Poisoned lock") + .queue_events() + .iter() + .enumerate() + { + let notify_base = bar_addr + NOTIFICATION_BAR_OFFSET; + let io_addr = + IoEventAddress::Mmio(notify_base + i as u64 * NOTIFY_OFF_MULTIPLIER as u64); + vm.fd().register_ioevent(queue_evt, &io_addr, NoDatamatch)?; + } + Ok(()) + } + + /// Unregister the IoEvent notification for a VirtIO device + pub fn unregister_notification_ioevent( + &self, + vm: &Vm, + ) -> std::result::Result<(), errno::Error> { + let bar_addr = self.config_bar_addr(); + for (i, queue_evt) in self + .device + .lock() + .expect("Poisoned lock") + .queue_events() + .iter() + .enumerate() + { + let notify_base = bar_addr + NOTIFICATION_BAR_OFFSET; + let io_addr = + IoEventAddress::Mmio(notify_base + i 
as u64 * NOTIFY_OFF_MULTIPLIER as u64); + vm.fd() + .unregister_ioevent(queue_evt, &io_addr, NoDatamatch)?; + } + Ok(()) + } +} + +pub struct VirtioInterruptMsix { + msix_config: Arc>, + config_vector: Arc, + queues_vectors: Arc>>, + interrupt_source_group: Arc, +} + +impl std::fmt::Debug for VirtioInterruptMsix { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("VirtioInterruptMsix") + .field("msix_config", &self.msix_config) + .field("config_vector", &self.config_vector) + .field("queues_vectors", &self.queues_vectors) + .finish() + } +} + +impl VirtioInterruptMsix { + pub fn new( + msix_config: Arc>, + config_vector: Arc, + queues_vectors: Arc>>, + interrupt_source_group: Arc, + ) -> Self { + VirtioInterruptMsix { + msix_config, + config_vector, + queues_vectors, + interrupt_source_group, + } + } +} + +impl VirtioInterrupt for VirtioInterruptMsix { + fn trigger(&self, int_type: VirtioInterruptType) -> std::result::Result<(), std::io::Error> { + let vector = match int_type { + VirtioInterruptType::Config => self.config_vector.load(Ordering::Acquire), + VirtioInterruptType::Queue(queue_index) => { + self.queues_vectors.lock().unwrap()[queue_index as usize] + } + }; + + if vector == VIRTQ_MSI_NO_VECTOR { + return Ok(()); + } + + let config = &mut self.msix_config.lock().unwrap(); + let entry = &config.table_entries[vector as usize]; + // In case the vector control register associated with the entry + // has its first bit set, this means the vector is masked and the + // device should not inject the interrupt. + // Instead, the Pending Bit Array table is updated to reflect there + // is a pending interrupt for this specific vector. 
+ if config.masked() || entry.masked() { + config.set_pba_bit(vector, false); + return Ok(()); + } + + self.interrupt_source_group + .trigger(vector as InterruptIndex) + } + + fn notifier(&self, int_type: VirtioInterruptType) -> Option<&EventFd> { + let vector = match int_type { + VirtioInterruptType::Config => self.config_vector.load(Ordering::Acquire), + VirtioInterruptType::Queue(queue_index) => { + self.queues_vectors.lock().unwrap()[queue_index as usize] + } + }; + + self.interrupt_source_group + .notifier(vector as InterruptIndex) + } + + fn status(&self) -> Arc { + Arc::new(AtomicU32::new(0)) + } + + #[cfg(test)] + fn has_pending_interrupt(&self, interrupt_type: VirtioInterruptType) -> bool { + false + } +} + +impl PciDevice for VirtioPciDevice { + fn write_config_register( + &mut self, + reg_idx: usize, + offset: u64, + data: &[u8], + ) -> Option> { + // Handle the special case where the capability VIRTIO_PCI_CAP_PCI_CFG + // is accessed. This capability has a special meaning as it allows the + // guest to access other capabilities without mapping the PCI BAR. + let base = reg_idx * 4; + if base + u64_to_usize(offset) >= self.cap_pci_cfg_info.offset + && base + u64_to_usize(offset) + data.len() + <= self.cap_pci_cfg_info.offset + self.cap_pci_cfg_info.cap.bytes().len() + { + let offset = base + u64_to_usize(offset) - self.cap_pci_cfg_info.offset; + self.write_cap_pci_cfg(offset, data) + } else { + self.configuration + .write_config_register(reg_idx, offset, data); + None + } + } + + fn read_config_register(&mut self, reg_idx: usize) -> u32 { + // Handle the special case where the capability VIRTIO_PCI_CAP_PCI_CFG + // is accessed. This capability has a special meaning as it allows the + // guest to access other capabilities without mapping the PCI BAR. 
+ let base = reg_idx * 4; + if base >= self.cap_pci_cfg_info.offset + && base + 4 <= self.cap_pci_cfg_info.offset + self.cap_pci_cfg_info.cap.bytes().len() + { + let offset = base - self.cap_pci_cfg_info.offset; + let mut data = [0u8; 4]; + self.read_cap_pci_cfg(offset, &mut data); + u32::from_le_bytes(data) + } else { + self.configuration.read_reg(reg_idx) + } + } + + fn detect_bar_reprogramming( + &mut self, + reg_idx: usize, + data: &[u8], + ) -> Option { + self.configuration.detect_bar_reprogramming(reg_idx, data) + } + + fn allocate_bars( + &mut self, + mmio32_allocator: &mut AddressAllocator, + mmio64_allocator: &mut AddressAllocator, + resources: Option>, + ) -> std::result::Result, PciDeviceError> { + let mut bars = Vec::new(); + let device_clone = self.device.clone(); + let device = device_clone.lock().unwrap(); + + let mut settings_bar_addr = None; + let mut use_64bit_bar = self.use_64bit_bar; + let restoring = resources.is_some(); + if let Some(resources) = resources { + for resource in resources { + if let Resource::PciBar { + index, base, type_, .. + } = resource + { + if index == VIRTIO_COMMON_BAR_INDEX { + settings_bar_addr = Some(GuestAddress(base)); + use_64bit_bar = match type_ { + PciBarType::Io => { + return Err(PciDeviceError::InvalidResource(resource)); + } + PciBarType::Mmio32 => false, + PciBarType::Mmio64 => true, + }; + break; + } + } + } + // Error out if no resource was matching the BAR id. + if settings_bar_addr.is_none() { + return Err(PciDeviceError::MissingResource); + } + } + + // Allocate the virtio-pci capability BAR. 
+ // See http://docs.oasis-open.org/virtio/virtio/v1.0/cs04/virtio-v1.0-cs04.html#x1-740004 + let policy = match settings_bar_addr { + Some(addr) => AllocPolicy::ExactMatch(addr.0), + None => AllocPolicy::FirstMatch, + }; + let (virtio_pci_bar_addr, region_type) = if use_64bit_bar { + let region_type = PciBarRegionType::Memory64BitRegion; + let addr = mmio64_allocator + .allocate(CAPABILITY_BAR_SIZE, CAPABILITY_BAR_SIZE, policy) + .unwrap() + .start(); + (addr, region_type) + } else { + let region_type = PciBarRegionType::Memory32BitRegion; + let addr = mmio32_allocator + .allocate(CAPABILITY_BAR_SIZE, CAPABILITY_BAR_SIZE, policy) + .unwrap() + .start(); + (addr, region_type) + }; + + let bar = PciBarConfiguration::default() + .set_index(VIRTIO_COMMON_BAR_INDEX) + .set_address(virtio_pci_bar_addr) + .set_size(CAPABILITY_BAR_SIZE) + .set_region_type(region_type); + + // The creation of the PCI BAR and its associated capabilities must + // happen only during the creation of a brand new VM. When a VM is + // restored from a known state, the BARs are already created with the + // right content, therefore we don't need to go through this codepath. + if !restoring { + self.configuration + .add_pci_bar(&bar) + .map_err(|e| PciDeviceError::IoRegistrationFailed(virtio_pci_bar_addr, e))?; + + // Once the BARs are allocated, the capabilities can be added to the PCI configuration. + self.add_pci_capabilities(VIRTIO_COMMON_BAR_INDEX.try_into().unwrap())?; + } + + bars.push(bar); + + self.bar_regions.clone_from(&bars); + + Ok(bars) + } + + fn free_bars( + &mut self, + mmio32_allocator: &mut AddressAllocator, + mmio64_allocator: &mut AddressAllocator, + ) -> std::result::Result<(), PciDeviceError> { + for bar in self.bar_regions.drain(..) 
{ + let range = RangeInclusive::new(bar.addr(), bar.addr() + bar.size()).unwrap(); + match bar.region_type() { + PciBarRegionType::Memory32BitRegion => { + mmio32_allocator.free(&range); + } + PciBarRegionType::Memory64BitRegion => { + mmio64_allocator.free(&range); + } + _ => error!("Unexpected PCI bar type"), + } + } + Ok(()) + } + + fn move_bar( + &mut self, + old_base: u64, + new_base: u64, + ) -> std::result::Result<(), std::io::Error> { + // We only update our idea of the bar in order to support free_bars() above. + // The majority of the reallocation is done inside DeviceManager. + for bar in self.bar_regions.iter_mut() { + if bar.addr() == old_base { + *bar = bar.set_address(new_base); + } + } + + Ok(()) + } + + fn read_bar(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + match offset { + o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => { + self.common_config + .read(o - COMMON_CONFIG_BAR_OFFSET, data, self.device.clone()) + } + o if (ISR_CONFIG_BAR_OFFSET..ISR_CONFIG_BAR_OFFSET + ISR_CONFIG_SIZE).contains(&o) => { + if let Some(v) = data.get_mut(0) { + // Reading this register resets it to 0. + *v = self + .interrupt_status + .swap(0, Ordering::AcqRel) + .try_into() + .unwrap(); + } + } + o if (DEVICE_CONFIG_BAR_OFFSET..DEVICE_CONFIG_BAR_OFFSET + DEVICE_CONFIG_SIZE) + .contains(&o) => + { + let device = self.device.lock().unwrap(); + device.read_config(o - DEVICE_CONFIG_BAR_OFFSET, data); + } + o if (NOTIFICATION_BAR_OFFSET..NOTIFICATION_BAR_OFFSET + NOTIFICATION_SIZE) + .contains(&o) => + { + // Handled with ioeventfds. 
+ } + o if (MSIX_TABLE_BAR_OFFSET..MSIX_TABLE_BAR_OFFSET + MSIX_TABLE_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .read_table(o - MSIX_TABLE_BAR_OFFSET, data); + } + } + o if (MSIX_PBA_BAR_OFFSET..MSIX_PBA_BAR_OFFSET + MSIX_PBA_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .read_pba(o - MSIX_PBA_BAR_OFFSET, data); + } + } + _ => (), + } + } + + fn write_bar(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { + match offset { + o if o < COMMON_CONFIG_BAR_OFFSET + COMMON_CONFIG_SIZE => { + self.common_config + .write(o - COMMON_CONFIG_BAR_OFFSET, data, self.device.clone()) + } + o if (ISR_CONFIG_BAR_OFFSET..ISR_CONFIG_BAR_OFFSET + ISR_CONFIG_SIZE).contains(&o) => { + if let Some(v) = data.first() { + self.interrupt_status + .fetch_and(!(*v as usize), Ordering::AcqRel); + } + } + o if (DEVICE_CONFIG_BAR_OFFSET..DEVICE_CONFIG_BAR_OFFSET + DEVICE_CONFIG_SIZE) + .contains(&o) => + { + let mut device = self.device.lock().unwrap(); + device.write_config(o - DEVICE_CONFIG_BAR_OFFSET, data); + } + o if (NOTIFICATION_BAR_OFFSET..NOTIFICATION_BAR_OFFSET + NOTIFICATION_SIZE) + .contains(&o) => + { + // Handled with ioeventfds. 
+ error!("Unexpected write to notification BAR: offset = 0x{:x}", o); + } + o if (MSIX_TABLE_BAR_OFFSET..MSIX_TABLE_BAR_OFFSET + MSIX_TABLE_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .write_table(o - MSIX_TABLE_BAR_OFFSET, data); + } + } + o if (MSIX_PBA_BAR_OFFSET..MSIX_PBA_BAR_OFFSET + MSIX_PBA_SIZE).contains(&o) => { + if let Some(msix_config) = &self.msix_config { + msix_config + .lock() + .unwrap() + .write_pba(o - MSIX_PBA_BAR_OFFSET, data); + } + } + _ => (), + }; + + // Try and activate the device if the driver status has changed + if self.needs_activation() { + debug!("Activating device"); + self.virtio_device() + .lock() + .unwrap() + .activate( + self.memory.clone(), + Arc::clone(self.virtio_interrupt.as_ref().unwrap()), + ) + .unwrap_or_else(|err| error!("Error activating device: {err:?}")); + } else { + debug!("Device doesn't need activation"); + } + + // Device has been reset by the driver + if self.device_activated.load(Ordering::SeqCst) && self.is_driver_init() { + let mut device = self.device.lock().unwrap(); + let reset_result = device.reset(); + match reset_result { + Some((virtio_interrupt, mut _queue_evts)) => { + // Upon reset the device returns its interrupt EventFD + self.virtio_interrupt = Some(virtio_interrupt); + self.device_activated.store(false, Ordering::SeqCst); + + // Reset queue readiness (changes queue_enable), queue sizes + // and selected_queue as per spec for reset + self.virtio_device() + .lock() + .unwrap() + .queues_mut() + .iter_mut() + .for_each(Queue::reset); + self.common_config.queue_select = 0; + } + None => { + error!("Attempt to reset device when not implemented in underlying device"); + self.common_config.driver_status = DEVICE_FAILED; + } + } + } + + None + } + + fn id(&self) -> Option { + Some(self.id.clone()) + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } +} + +impl BusDevice for VirtioPciDevice { + fn read(&mut self, base: u64, 
offset: u64, data: &mut [u8]) { + self.read_bar(base, offset, data) + } + + fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option> { + self.write_bar(base, offset, data) + } +} + +#[cfg(test)] +mod tests { + use std::sync::{Arc, Mutex}; + + use event_manager::MutEventSubscriber; + use linux_loader::loader::Cmdline; + use pci::{PciBdf, PciClassCode, PciDevice, PciSubclass}; + + use super::VirtioPciDevice; + use crate::Vm; + use crate::arch::MEM_64BIT_DEVICES_START; + use crate::builder::tests::default_vmm; + use crate::devices::virtio::device::VirtioDevice; + use crate::devices::virtio::rng::Entropy; + use crate::devices::virtio::transport::pci::device::PciVirtioSubclass; + use crate::rate_limiter::RateLimiter; + + #[test] + fn test_pci_device_config() { + let mut vmm = default_vmm(); + vmm.device_manager.enable_pci(); + let entropy = Arc::new(Mutex::new(Entropy::new(RateLimiter::default()).unwrap())); + vmm.device_manager + .attach_virtio_device( + &vmm.vm, + "rng".to_string(), + entropy.clone(), + &mut Cmdline::new(1024).unwrap(), + false, + ) + .unwrap(); + + let device = vmm + .device_manager + .pci_devices + .get_virtio_device(entropy.lock().unwrap().device_type(), "rng") + .unwrap(); + + let mut locked_virtio_pci_device = device.lock().unwrap(); + + // For more information for the values we are checking here look into the VirtIO spec here: + // https://docs.oasis-open.org/virtio/virtio/v1.1/csprd01/virtio-v1.1-csprd01.html#x1-1220007 + // and PCI Header type 0 layout here: https://wiki.osdev.org/PCI#Configuration_Space + + // | 16 bits | 16 bits | + // |-----------|-----------| + // register 0x0: | Device ID | Vendor ID | + // + // Vendor ID of VirtIO devices is 0x1af4 + let reg0 = locked_virtio_pci_device.read_config_register(0); + assert_eq!(reg0 & 0xffff, 0x1af4); + // VirtIO PCI device IDs are in the range [0x1000, 0x107f]. (We are not using transitional + // device IDs).
+ let devid = reg0 >> 16; + assert!( + (0x1000..=0x107f).contains(&devid), + "Device ID check: {:#x} >= 0x1000 && {:#x} <= 0x107f", + devid, + devid + ); + + // | 16 bits | 16 bits | + // |------------|-----------| + // register 0x1: | Status | Command | + // We offer the capabilities list (bit 4 of status register) at offset 0x34 + let reg1 = locked_virtio_pci_device.read_config_register(1); + assert_eq!(reg1, 0x0010_0000); + + // | 8 bits | 8 bits | 8 bits | 8 bits | + // register 0x2: | Class code | Subclass | Prog IF | Revision ID | + // + // Class code: VIRTIO_PCI_VENDOR_ID for all VirtIO devices + // Subclass: PciClassCode::NetworkController for net, PciClassCode::MassStore for block + // PciClassCode::Other for everything else + // Prog IF: A register defining some programmable interface register. 0 for VirtIO devices + // Revision ID: 0x1 for modern VirtIO devices + let reg2 = locked_virtio_pci_device.read_config_register(2); + assert_eq!(reg2, 0xffff_0001); + let class_code = ((reg2 >> 24) & 0xff) as u8; + assert_eq!(class_code, PciClassCode::Other.get_register_value()); + let subclass = ((reg2 >> 16) & 0xff) as u8; + assert_eq!( + subclass, + PciVirtioSubclass::NonTransitionalBase.get_register_value() + ); + let prog_if = ((reg2 >> 8) & 0xff) as u8; + assert_eq!(prog_if, 0); + let revision_id = reg2 & 0xff; + assert_eq!(revision_id, 0x1); + + // | 8 bits | 8 bits | 8 bits | 8 bits | + // register 0x3: | BIST | Header Type | Latency timer | Cache line size | + // + // BIST: status and control for self test of PCI devices.
Always 0 for VirtIO devices + // HeaderType: 0x0 for general devices + // LatencyTimer: Latency timer in units of PCI bus clocks, 0 for VirtIO + // Cache Line size: 0 for VirtIO devices + let reg3 = locked_virtio_pci_device.read_config_register(3); + assert_eq!(reg3, 0x0); + + // register 0xa: Cardbus CIS pointer + // + // We don't emulate CardBus + let reg10 = locked_virtio_pci_device.read_config_register(0xa); + assert_eq!(reg10, 0); + + // | 16 bits | 16 bits | + // register 0xb: | Subsystem ID | Subsystem vendor ID| + // + // For us Subsystem ID is same as device ID and subsystem vendor ID is same as vendor ID + // (reg 0x0) + let reg11 = locked_virtio_pci_device.read_config_register(0xb); + assert_eq!(reg11, reg0); + + // register 0xc: Expansion ROM base address: 0x0 for us + let reg12 = locked_virtio_pci_device.read_config_register(0xc); + assert_eq!(reg12, 0); + + // | 24 bits | 8 bits | + // register 0xd: | Reserved | Capabilities pointer | + let reg13 = locked_virtio_pci_device.read_config_register(0xd); + assert_eq!(reg13 >> 24, 0); + + // register 0xe: Reserved + let reg14 = locked_virtio_pci_device.read_config_register(0xe); + assert_eq!(reg14, 0); + + // | 8 bits | 8 bits | 8 bits | 8 bits | + // register 0xf: | max latency | min grant | Interrupt pin | Interrupt line | + // + // We don't specify any of those + let reg15 = locked_virtio_pci_device.read_config_register(0xf); + assert_eq!(reg15, 0); + } + + #[test] + fn test_reading_bars() { + let mut vmm = default_vmm(); + vmm.device_manager.enable_pci(); + let entropy = Arc::new(Mutex::new(Entropy::new(RateLimiter::default()).unwrap())); + vmm.device_manager + .attach_virtio_device( + &vmm.vm, + "rng".to_string(), + entropy.clone(), + &mut Cmdline::new(1024).unwrap(), + false, + ) + .unwrap(); + + let device = vmm + .device_manager + .pci_devices + .get_virtio_device(entropy.lock().unwrap().device_type(), "rng") + .unwrap(); + + let mut locked_virtio_pci_device = device.lock().unwrap(); + + // According
to OSdev wiki (https://wiki.osdev.org/PCI#Configuration_Space): + // + // When you want to retrieve the actual base address of a BAR, be sure to mask the lower + // bits. For 16-bit Memory Space BARs, you calculate (BAR[x] & 0xFFF0). For 32-bit Memory + // Space BARs, you calculate (BAR[x] & 0xFFFFFFF0). For 64-bit Memory Space BARs, you + // calculate ((BAR[x] & 0xFFFFFFF0) + ((BAR[x + 1] & 0xFFFFFFFF) << 32)) For I/O Space + // BARs, you calculate (BAR[x] & 0xFFFFFFFC). + + // We are allocating a single 64-bit MMIO bar for VirtIO capabilities list. As a result, we + // are using the first two BAR registers from the configuration space. + // + // The BAR address layout is as follows: + // + // | Bits 31-4 | Bit 3 | Bits 2-1 | Bit 0 | + // | 16-Byte Aligned Base Address | Prefetchable | Type | Always 0 | + // + // For 64-bit addresses though a second BAR is used to hold the upper 32 bits + // of the address. Prefetchable and type will be held in the lower bits of the + // first bar along with the lower 32-bits of the address which is always 16-bytes + // aligned. + let bar_addr_lo = locked_virtio_pci_device.read_config_register(0x4); + let bar_addr_hi = locked_virtio_pci_device.read_config_register(0x5); + let bar_addr = bar_addr_lo as u64 + ((bar_addr_hi as u64) << 32); + + // Bit 0 always 0 + assert_eq!(bar_addr & 0x1, 0); + // Type is 0x2 meaning 64-bit BAR + assert_eq!((bar_addr & 0x6) >> 1, 2); + // The actual address of the BAR should be the first available address of our 64-bit MMIO + // region + assert_eq!(bar_addr & 0xffff_ffff_ffff_fff0, MEM_64BIT_DEVICES_START); + + // Reading the BAR size is a bit more convoluted. According to OSDev wiki: + // + // To determine the amount of address space needed by a PCI device, you must save the + // original value of the BAR, write a value of all 1's to the register, then read it back.
+ // The amount of memory can then be determined by masking the information bits, performing + // a bitwise NOT ('~' in C), and incrementing the value by 1. + + locked_virtio_pci_device.write_config_register(0x4, 0, &[0xff, 0xff, 0xff, 0xff]); + // Read the lower size bits and mask out the last 4 bits, which include Prefetchable, Type and + // hardwired-0 + let bar_size_lo = locked_virtio_pci_device.read_config_register(0x4) as u64 & 0xfffffff0; + locked_virtio_pci_device.write_config_register(0x5, 0, &[0xff, 0xff, 0xff, 0xff]); + let bar_size_hi = locked_virtio_pci_device.read_config_register(0x5) as u64; + let bar_size = !((bar_size_hi << 32) | bar_size_lo) + 1; + + // We create a capabilities BAR region of 0x80000 bytes + assert_eq!(bar_size, 0x80000); + } +} diff --git a/src/vmm/src/devices/virtio/transport/pci/mod.rs b/src/vmm/src/devices/virtio/transport/pci/mod.rs new file mode 100644 index 00000000000..520b52274b3 --- /dev/null +++ b/src/vmm/src/devices/virtio/transport/pci/mod.rs @@ -0,0 +1,5 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +pub mod common_config; +pub mod device; From 46eb5f3f4070fa3ae71b98e6823236b796d49ef6 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 4 Jun 2025 10:29:18 +0200 Subject: [PATCH 43/56] seccomp: allow new ioctls for vCPU threads We are now calling KVM_CHECK_EXTENSION for checking the KVM_CAP_MSI_DEVID capability. We are also calling KVM_SET_GSI_ROUTING to set the interrupt routes and KVM_IRQFD to set/unset interrupt lines.
Signed-off-by: Babis Chalios --- .../seccomp/aarch64-unknown-linux-musl.json | 43 +++++++++++++++++++ .../seccomp/x86_64-unknown-linux-musl.json | 43 +++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/resources/seccomp/aarch64-unknown-linux-musl.json b/resources/seccomp/aarch64-unknown-linux-musl.json index db3abe1eced..e3aaeaf911b 100644 --- a/resources/seccomp/aarch64-unknown-linux-musl.json +++ b/resources/seccomp/aarch64-unknown-linux-musl.json @@ -1017,6 +1017,49 @@ { "syscall": "restart_syscall", "comment": "automatically issued by the kernel when specific timing-related syscalls (e.g. nanosleep) get interrupted by SIGSTOP" + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 44547, + "comment": "KVM_CHECK_EXTENSION" + }, + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 131, + "comment": "KVM_CAP_MSI_DEVID" + } + ] + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1074310762, + "comment": "KVM_SET_GSI_ROUTING" + } + ] + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1075883638, + "comment": "KVM_IRQFD" + } + ] } ] } diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index 95ceca1b7ef..3dcdbf659d1 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -1149,6 +1149,49 @@ { "syscall": "restart_syscall", "comment": "automatically issued by the kernel when specific timing-related syscalls (e.g. 
nanosleep) get interrupted by SIGSTOP" + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 44547, + "comment": "KVM_CHECK_EXTENSION" + }, + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 131, + "comment": "KVM_CAP_MSI_DEVID" + } + ] + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1074310762, + "comment": "KVM_SET_GSI_ROUTING" + } + ] + }, + { + "syscall": "ioctl", + "args": [ + { + "index": 1, + "type": "dword", + "op": "eq", + "val": 1075883638, + "comment": "KVM_IRQFD" + } + ] } ] } From 88ca3d510bf53cdb071eb4f0c4a93b2efb6e09d9 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 4 Jun 2025 15:07:11 +0200 Subject: [PATCH 44/56] pci: add unit tests to PciSegment Add some unit tests to PciSegment. We now test that the next_device_bdf() method and the initialization logic work as expected. We also check that the configuration space of the PCI segment is correctly registered with the MMIO and, on x86, PIO bus. 
Signed-off-by: Babis Chalios --- src/vmm/src/devices/pci/pci_segment.rs | 97 ++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/src/vmm/src/devices/pci/pci_segment.rs b/src/vmm/src/devices/pci/pci_segment.rs index 169ffdcba3b..c1e8bb07cb8 100644 --- a/src/vmm/src/devices/pci/pci_segment.rs +++ b/src/vmm/src/devices/pci/pci_segment.rs @@ -462,3 +462,100 @@ impl Aml for PciSegment { .append_aml_bytes(v) } } + +#[cfg(test)] +mod tests { + + use super::*; + use crate::arch; + use crate::utils::u64_to_usize; + + #[test] + fn test_pci_segment_build() { + let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); + let pci_irq_slots = &[0u8; 32]; + let pci_segment = PciSegment::new(0, &resource_allocator, pci_irq_slots).unwrap(); + + assert_eq!(pci_segment.id, 0); + assert_eq!( + pci_segment.start_of_mem32_area, + arch::MEM_32BIT_DEVICES_START + ); + assert_eq!( + pci_segment.end_of_mem32_area, + arch::MEM_32BIT_DEVICES_START + arch::MEM_32BIT_DEVICES_SIZE - 1 + ); + assert_eq!( + pci_segment.start_of_mem64_area, + arch::MEM_64BIT_DEVICES_START + ); + assert_eq!( + pci_segment.end_of_mem64_area, + arch::MEM_64BIT_DEVICES_START + arch::MEM_64BIT_DEVICES_SIZE - 1 + ); + assert_eq!(pci_segment.mmio_config_address, arch::PCI_MMCONFIG_START); + assert_eq!(pci_segment.proximity_domain, 0); + assert_eq!(pci_segment.pci_devices_up, 0); + assert_eq!(pci_segment.pci_devices_down, 0); + assert_eq!(pci_segment.pci_irq_slots, [0u8; 32]); + } + + #[cfg(target_arch = "x86_64")] + #[test] + fn test_io_bus() { + let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); + let pci_irq_slots = &[0u8; 32]; + let pci_segment = PciSegment::new(0, &resource_allocator, pci_irq_slots).unwrap(); + + let mut data = [0u8; u64_to_usize(PCI_CONFIG_IO_PORT_SIZE)]; + resource_allocator + .pio_bus + .read(PCI_CONFIG_IO_PORT, &mut data) + .unwrap(); + + resource_allocator + .pio_bus + .read(PCI_CONFIG_IO_PORT + PCI_CONFIG_IO_PORT_SIZE, &mut data) + 
.unwrap_err(); + } + + #[test] + fn test_mmio_bus() { + let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); + let pci_irq_slots = &[0u8; 32]; + let pci_segment = PciSegment::new(0, &resource_allocator, pci_irq_slots).unwrap(); + + let mut data = [0u8; u64_to_usize(PCI_MMIO_CONFIG_SIZE_PER_SEGMENT)]; + + resource_allocator + .mmio_bus + .read(pci_segment.mmio_config_address, &mut data) + .unwrap(); + resource_allocator + .mmio_bus + .read( + pci_segment.mmio_config_address + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, + &mut data, + ) + .unwrap_err(); + } + + #[test] + fn test_next_device_bdf() { + let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); + let pci_irq_slots = &[0u8; 32]; + let pci_segment = PciSegment::new(0, &resource_allocator, pci_irq_slots).unwrap(); + + // Start checking from device id 1, since 0 is allocated to the Root port. + for dev_id in 1..32 { + let bdf = pci_segment.next_device_bdf().unwrap(); + // In our case we have a single Segment with id 0, which has + // a single bus with id 0. Also, each device of ours has a + // single function. + assert_eq!(bdf, PciBdf::new(0, 0, dev_id, 0)); + } + + // We can only have 32 devices on a segment + pci_segment.next_device_bdf().unwrap_err(); + } +} From f8942498777fc2b0c8bd38de1b98405aecea7ed4 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Thu, 5 Jun 2025 15:34:45 +0200 Subject: [PATCH 45/56] device_manager: save resource allocator in snapshot vm-allocator now allows us to (De)serialize IdAllocator and AddressAllocator types. Add ResourceAllocator in DeviceManager snapshot state and restore it when loading a snapshot. Like this we can avoid doing the ExactMatch allocations during snapshot resumes for reserving the exact same MMIO ranges. Moreover, change DeviceManager and PciDevices to provide save/restore functionality via the Persist trait. Like that we can avoid first creating the objects and then restoring their state, overwriting their fields. 
Signed-off-by: Babis Chalios --- Cargo.lock | 1 + src/vmm/Cargo.toml | 2 +- src/vmm/src/builder.rs | 22 ++-- src/vmm/src/device_manager/mod.rs | 158 +++++++++++++++--------- src/vmm/src/device_manager/pci_mngr.rs | 49 +++++--- src/vmm/src/device_manager/persist.rs | 38 ++---- src/vmm/src/device_manager/resources.rs | 130 ++++++++++++++++++- src/vmm/src/devices/acpi/vmgenid.rs | 5 - src/vmm/src/lib.rs | 1 + src/vmm/src/persist.rs | 1 + 10 files changed, 283 insertions(+), 124 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d0873fb7346..e1b6f10897c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1620,6 +1620,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "040a65b0c29f298d71ca45dd52d02b0d0ddc15b9b97d95dfeebe67d6fdd42a28" dependencies = [ "libc", + "serde", "thiserror 2.0.12", ] diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index 688d44b660d..c89ad283474 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -52,7 +52,7 @@ userfaultfd = "0.8.1" utils = { path = "../utils" } uuid = "1.16.0" vhost = { version = "0.14.0", features = ["vhost-user-frontend"] } -vm-allocator = "0.1.3" +vm-allocator = { version = "0.1.3", features = ["serde"] } vm-device = { path = "../vm-device" } vm-memory = { version = "0.16.2", features = [ "backend-mmap", diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 5a255f5cf7b..3284b11f559 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -42,6 +42,7 @@ use crate::logger::debug; use crate::persist::{MicrovmState, MicrovmStateError}; use crate::resources::VmResources; use crate::seccomp::BpfThreadMap; +use crate::snapshot::Persist; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::MachineConfigError; use crate::vstate::kvm::{Kvm, KvmError}; @@ -411,8 +412,6 @@ pub fn build_microvm_from_snapshot( .create_vcpus(vm_resources.machine_config.vcpu_count) .map_err(StartMicrovmError::Vm)?; - let mut device_manager = 
DeviceManager::new(event_manager, &vcpus_exit_evt, &vm).unwrap(); - vm.register_memory_regions(guest_memory) .map_err(StartMicrovmError::Vm)?; @@ -430,16 +429,6 @@ pub fn build_microvm_from_snapshot( } } - // Restore allocator state - #[cfg(target_arch = "aarch64")] - if let Some(pvtime_ipa) = vcpus[0].kvm_vcpu.pvtime_ipa { - allocate_pvtime_region( - &mut device_manager, - vcpus.len(), - vm_allocator::AllocPolicy::ExactMatch(pvtime_ipa.0), - )?; - } - // Restore vcpus kvm state. for (vcpu, state) in vcpus.iter_mut().zip(microvm_state.vcpu_states.iter()) { vcpu.kvm_vcpu @@ -463,6 +452,9 @@ pub fn build_microvm_from_snapshot( vm_resources.boot_source.config = microvm_state.vm_info.boot_source; // Restore devices states. + // Restoring VMGenID injects an interrupt in the guest to notify it about the new generation + // ID. As a result, we need to restore DeviceManager after restoring the KVM state, otherwise + // the injected interrupt will be overwritten. let device_ctor_args = DeviceRestoreArgs { mem: vm.guest_memory(), vm: &vm, @@ -470,9 +462,11 @@ pub fn build_microvm_from_snapshot( vm_resources, instance_id: &instance_info.id, restored_from_file: uffd.is_none(), + vcpus_exit_evt: &vcpus_exit_evt, }; - - device_manager.restore(&microvm_state.device_states, device_ctor_args)?; + #[allow(unused_mut)] + let mut device_manager = + DeviceManager::restore(device_ctor_args, &microvm_state.device_states)?; let mut vmm = Vmm { events_observer: Some(std::io::stdin()), diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index da61db922c3..e60d64394e8 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -15,7 +15,7 @@ use legacy::{LegacyDeviceError, PortIODeviceManager}; use linux_loader::loader::Cmdline; use log::error; use mmio::{MMIODeviceManager, MmioError}; -use pci_mngr::{PciDevices, PciManagerError}; +use pci_mngr::{PciDevices, PciDevicesConstructorArgs, PciManagerError}; use
persist::{ACPIDeviceManagerConstructorArgs, MMIODevManagerConstructorArgs}; use resources::ResourceAllocator; use serde::{Deserialize, Serialize}; @@ -127,30 +127,39 @@ impl DeviceManager { Ok(serial) } + #[cfg(target_arch = "x86_64")] + fn create_legacy_devices( + event_manager: &mut EventManager, + vcpus_exit_evt: &EventFd, + vm: &Vm, + resource_allocator: &ResourceAllocator, + ) -> Result { + Self::set_stdout_nonblocking(); + + // Create serial device + let serial = Self::setup_serial_device(event_manager)?; + let reset_evt = vcpus_exit_evt + .try_clone() + .map_err(DeviceManagerCreateError::EventFd)?; + // Create keyboard emulator for reset event + let i8042 = Arc::new(Mutex::new(I8042Device::new(reset_evt)?)); + + // create pio dev manager with legacy devices + let mut legacy_devices = PortIODeviceManager::new(serial, i8042)?; + legacy_devices.register_devices(&resource_allocator.pio_bus, vm)?; + Ok(legacy_devices) + } + #[cfg_attr(target_arch = "aarch64", allow(unused))] pub fn new( event_manager: &mut EventManager, - vcpu_exit_evt: &EventFd, + vcpus_exit_evt: &EventFd, vm: &Vm, ) -> Result { let resource_allocator = Arc::new(ResourceAllocator::new()?); #[cfg(target_arch = "x86_64")] - let legacy_devices = { - Self::set_stdout_nonblocking(); - - // Create serial device - let serial = Self::setup_serial_device(event_manager)?; - let reset_evt = vcpu_exit_evt - .try_clone() - .map_err(DeviceManagerCreateError::EventFd)?; - // Create keyboard emulator for reset event - let i8042 = Arc::new(Mutex::new(I8042Device::new(reset_evt)?)); - - // create pio dev manager with legacy devices - let mut legacy_devices = PortIODeviceManager::new(serial, i8042)?; - legacy_devices.register_devices(&resource_allocator.pio_bus, vm)?; - legacy_devices - }; + let legacy_devices = + Self::create_legacy_devices(event_manager, vcpus_exit_evt, vm, &resource_allocator)?; Ok(DeviceManager { resource_allocator, @@ -270,6 +279,8 @@ impl DeviceManager { #[derive(Debug, Default, Clone, 
Serialize, Deserialize)] /// State of devices in the system pub struct DevicesState { + /// Resource allocator state + pub resource_allocator_state: resources::ResourceAllocatorState, /// MMIO devices state pub mmio_state: persist::DeviceStates, /// ACPI devices state @@ -292,12 +303,15 @@ pub enum DevicePersistError { SerialRestore(#[from] EmulateSerialInitError), /// Error inserting device in bus: {0} Bus(#[from] vm_device::BusError), + /// Error creating DeviceManager: {0} + DeviceManager(#[from] DeviceManagerCreateError), } pub struct DeviceRestoreArgs<'a> { pub mem: &'a GuestMemoryMmap, pub vm: &'a Vm, pub event_manager: &'a mut EventManager, + pub vcpus_exit_evt: &'a EventFd, pub vm_resources: &'a mut VmResources, pub instance_id: &'a str, pub restored_from_file: bool, @@ -315,15 +329,82 @@ impl std::fmt::Debug for DeviceRestoreArgs<'_> { } } -impl DeviceManager { - pub fn save(&self) -> DevicesState { +impl<'a> Persist<'a> for DeviceManager { + type State = DevicesState; + type ConstructorArgs = DeviceRestoreArgs<'a>; + type Error = DevicePersistError; + + fn save(&self) -> Self::State { DevicesState { + resource_allocator_state: self.resource_allocator.save(), mmio_state: self.mmio_devices.save(), acpi_state: self.acpi_devices.save(), pci_state: self.pci_devices.save(), } } + fn restore( + constructor_args: Self::ConstructorArgs, + state: &Self::State, + ) -> std::result::Result { + // Safe to unwrap here. ResourceAllocator restoring cannot fail. 
+ let resource_allocator = + Arc::new(ResourceAllocator::restore((), &state.resource_allocator_state).unwrap()); + + // Setup legacy devices in case of x86 + #[cfg(target_arch = "x86_64")] + let legacy_devices = Self::create_legacy_devices( + constructor_args.event_manager, + constructor_args.vcpus_exit_evt, + constructor_args.vm, + &resource_allocator, + )?; + + // Restore MMIO devices + let mmio_ctor_args = MMIODevManagerConstructorArgs { + mem: constructor_args.mem, + vm: constructor_args.vm, + event_manager: constructor_args.event_manager, + resource_allocator: &resource_allocator, + vm_resources: constructor_args.vm_resources, + instance_id: constructor_args.instance_id, + restored_from_file: constructor_args.restored_from_file, + }; + let mmio_devices = MMIODeviceManager::restore(mmio_ctor_args, &state.mmio_state)?; + + // Restore ACPI devices + let acpi_ctor_args = ACPIDeviceManagerConstructorArgs { + mem: constructor_args.mem, + resource_allocator: &resource_allocator, + vm: constructor_args.vm, + }; + let mut acpi_devices = ACPIDeviceManager::restore(acpi_ctor_args, &state.acpi_state)?; + acpi_devices.notify_vmgenid()?; + + // Restore PCI devices + let pci_ctor_args = PciDevicesConstructorArgs { + resource_allocator: &resource_allocator, + }; + let pci_devices = PciDevices::restore(pci_ctor_args, &state.pci_state)?; + + let device_manager = DeviceManager { + resource_allocator, + mmio_devices, + #[cfg(target_arch = "x86_64")] + legacy_devices, + acpi_devices, + pci_devices, + }; + + // Restore serial. 
+ // We need to do that after we restore mmio devices, otherwise it won't succeed in Aarch64 + device_manager.emulate_serial_init()?; + + Ok(device_manager) + } +} + +impl DeviceManager { /// Sets RDA bit in serial console pub fn emulate_serial_init(&self) -> Result<(), EmulateSerialInitError> { // When restoring from a previously saved state, there is no serial @@ -361,43 +442,6 @@ impl DeviceManager { Ok(()) } } - - pub fn restore( - &mut self, - state: &DevicesState, - restore_args: DeviceRestoreArgs, - ) -> Result<(), DevicePersistError> { - // Restore MMIO devices - let mmio_ctor_args = MMIODevManagerConstructorArgs { - mem: restore_args.mem, - vm: restore_args.vm, - event_manager: restore_args.event_manager, - resource_allocator: &self.resource_allocator, - vm_resources: restore_args.vm_resources, - instance_id: restore_args.instance_id, - restored_from_file: restore_args.restored_from_file, - }; - self.mmio_devices = MMIODeviceManager::restore(mmio_ctor_args, &state.mmio_state)?; - - // Restore serial. 
- // We need to do that after we restore mmio devices, otherwise it won't succeed in Aarch64 - self.emulate_serial_init()?; - - // Restore ACPI devices - let acpi_ctor_args = ACPIDeviceManagerConstructorArgs { - mem: restore_args.mem, - resource_allocator: &self.resource_allocator, - vm: restore_args.vm, - }; - self.acpi_devices = ACPIDeviceManager::restore(acpi_ctor_args, &state.acpi_state)?; - self.acpi_devices.notify_vmgenid()?; - - // Restore PCI devices - self.pci_devices - .restore(&state.pci_state, &self.resource_allocator)?; - - Ok(()) - } } #[cfg(test)] diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index 686349858fb..70bb03388f6 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -16,6 +16,7 @@ use crate::device_manager::resources::ResourceAllocator; use crate::devices::pci::PciSegment; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::transport::pci::device::{VirtioPciDevice, VirtioPciDeviceError}; +use crate::snapshot::Persist; use crate::vstate::vm::InterruptError; #[derive(Debug, Default)] @@ -65,24 +66,6 @@ impl PciDevices { Ok(()) } - pub fn save(&self) -> PciDevicesState { - PciDevicesState { - pci_enabled: self.pci_segment.is_some(), - } - } - - pub fn restore( - &mut self, - state: &PciDevicesState, - resource_allocator: &Arc, - ) -> Result<(), PciManagerError> { - if state.pci_enabled { - self.attach_pci_segment(resource_allocator)?; - } - - Ok(()) - } - fn register_bars_with_bus( resource_allocator: &ResourceAllocator, virtio_device: &Arc>, @@ -194,3 +177,33 @@ impl PciDevices { pub struct PciDevicesState { pci_enabled: bool, } + +#[derive(Debug)] +pub struct PciDevicesConstructorArgs<'a> { + pub resource_allocator: &'a Arc, +} + +impl<'a> Persist<'a> for PciDevices { + type State = PciDevicesState; + type ConstructorArgs = PciDevicesConstructorArgs<'a>; + type Error = PciManagerError; + + fn save(&self) -> Self::State { + 
PciDevicesState { + pci_enabled: self.pci_segment.is_some(), + } + } + + fn restore( + constructor_args: Self::ConstructorArgs, + state: &Self::State, + ) -> std::result::Result { + let mut pci_devices = PciDevices::new(); + + if state.pci_enabled { + pci_devices.attach_pci_segment(constructor_args.resource_allocator)?; + } + + Ok(pci_devices) + } +} diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 6b1168ec965..1952fdaee40 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -9,7 +9,6 @@ use std::sync::{Arc, Mutex}; use event_manager::{MutEventSubscriber, SubscriberOps}; use log::{error, warn}; use serde::{Deserialize, Serialize}; -use vm_allocator::AllocPolicy; use super::acpi::ACPIDeviceManager; use super::mmio::*; @@ -471,27 +470,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { .map_err(|()| DevicePersistError::MmioTransport)?, )); - // We do not currently require exact re-allocation of IDs via - // `dev_manager.irq_allocator.allocate_id()` and currently cannot do - // this effectively as `IdAllocator` does not implement an exact - // match API. - // In the future we may require preserving `IdAllocator`'s state - // after snapshot restore so as to restore the exact interrupt IDs - // from the original device's state for implementing hot-plug. - // For now this is why we do not restore the state of the - // `IdAllocator` under `dev_manager`. 
- - constructor_args - .resource_allocator - .allocate_32bit_mmio_memory( - MMIO_LEN, - MMIO_LEN, - AllocPolicy::ExactMatch(device_info.addr), - ) - .map_err(|e| { - DevicePersistError::DeviceManager(super::mmio::MmioError::Allocator(e)) - })?; - dev_manager.register_mmio_virtio( vm, id.clone(), @@ -678,6 +656,7 @@ mod tests { use super::*; use crate::builder::tests::*; + use crate::device_manager; use crate::devices::virtio::block::CacheType; use crate::resources::VmmConfig; use crate::snapshot::Snapshot; @@ -748,11 +727,10 @@ mod tests { #[test] fn test_device_manager_persistence() { - let mut buf = vec![0; 16384]; + let mut buf = vec![0; 65536]; // These need to survive so the restored blocks find them. let _block_files; let mut tmp_sock_file = TempFile::new().unwrap(); - let resource_allocator = ResourceAllocator::new().unwrap(); tmp_sock_file.remove().unwrap(); // Set up a vmm with one of each device, and get the serialized DeviceStates. { @@ -812,7 +790,10 @@ mod tests { let mut event_manager = EventManager::new().expect("Unable to create EventManager"); let vmm = default_vmm(); - let device_states: DeviceStates = Snapshot::deserialize(&mut buf.as_slice()).unwrap(); + let device_manager_state: device_manager::DevicesState = + Snapshot::deserialize(&mut buf.as_slice()).unwrap(); + let resource_allocator = + ResourceAllocator::restore((), &device_manager_state.resource_allocator_state).unwrap(); let vm_resources = &mut VmResources::default(); let restore_args = MMIODevManagerConstructorArgs { mem: vmm.vm.guest_memory(), @@ -824,7 +805,7 @@ mod tests { restored_from_file: true, }; let _restored_dev_manager = - MMIODeviceManager::restore(restore_args, &device_states).unwrap(); + MMIODeviceManager::restore(restore_args, &device_manager_state.mmio_state).unwrap(); let expected_vm_resources = format!( r#"{{ @@ -899,7 +880,10 @@ mod tests { .version(), MmdsVersion::V2 ); - assert_eq!(device_states.mmds_version.unwrap(), MmdsVersion::V2.into()); + assert_eq!( + 
device_manager_state.mmio_state.mmds_version.unwrap(), + MmdsVersion::V2.into() + ); assert_eq!( expected_vm_resources, serde_json::to_string_pretty(&VmmConfig::from(&*vm_resources)).unwrap() diff --git a/src/vmm/src/device_manager/resources.rs b/src/vmm/src/device_manager/resources.rs index 249d0507ba8..f7035e55566 100644 --- a/src/vmm/src/device_manager/resources.rs +++ b/src/vmm/src/device_manager/resources.rs @@ -1,14 +1,17 @@ // Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 +use std::convert::Infallible; use std::sync::{Arc, Mutex}; use pci::DeviceRelocation; +use serde::{Deserialize, Serialize}; pub use vm_allocator::AllocPolicy; use vm_allocator::{AddressAllocator, IdAllocator}; use vm_device::Bus; use crate::arch; +use crate::snapshot::Persist; /// A resource manager for (de)allocating interrupt lines (GSIs) and guest memory /// @@ -152,6 +155,69 @@ impl ResourceAllocator { } } +impl<'a> Persist<'a> for ResourceAllocator { + type State = ResourceAllocatorState; + type ConstructorArgs = (); + type Error = Infallible; + + fn save(&self) -> Self::State { + ResourceAllocatorState { + gsi_allocator: self.gsi_allocator.clone(), + mmio32_memory: self.mmio32_memory.clone(), + mmio64_memory: self.mmio64_memory.clone(), + system_memory: self.system_memory.clone(), + } + } + + fn restore( + _constructor_args: Self::ConstructorArgs, + state: &Self::State, + ) -> std::result::Result { + Ok(ResourceAllocator { + gsi_allocator: state.gsi_allocator.clone(), + mmio32_memory: state.mmio32_memory.clone(), + mmio64_memory: state.mmio64_memory.clone(), + system_memory: state.system_memory.clone(), + mmio_bus: Arc::new(Bus::new()), + #[cfg(target_arch = "x86_64")] + pio_bus: Arc::new(Bus::new()), + }) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ResourceAllocatorState { + // Allocator for device interrupt lines + pub gsi_allocator: Arc>, + // Allocator for memory in the 32-bit MMIO address 
space + pub mmio32_memory: Arc>, + // Allocator for memory in the 64-bit MMIO address space + pub mmio64_memory: Arc>, + // Memory allocator for system data + pub system_memory: Arc>, +} + +impl Default for ResourceAllocatorState { + fn default() -> Self { + Self { + gsi_allocator: Arc::new(Mutex::new( + IdAllocator::new(arch::IRQ_BASE, arch::IRQ_MAX).unwrap(), + )), + mmio32_memory: Arc::new(Mutex::new( + AddressAllocator::new(arch::MEM_32BIT_DEVICES_START, arch::MEM_32BIT_DEVICES_SIZE) + .unwrap(), + )), + mmio64_memory: Arc::new(Mutex::new( + AddressAllocator::new(arch::MEM_64BIT_DEVICES_START, arch::MEM_64BIT_DEVICES_SIZE) + .unwrap(), + )), + system_memory: Arc::new(Mutex::new( + AddressAllocator::new(arch::SYSTEM_MEM_START, arch::SYSTEM_MEM_SIZE).unwrap(), + )), + } + } +} + impl DeviceRelocation for ResourceAllocator { fn move_bar( &self, @@ -167,8 +233,11 @@ impl DeviceRelocation for ResourceAllocator { #[cfg(test)] mod tests { - use super::ResourceAllocator; - use crate::arch; + use vm_allocator::AllocPolicy; + + use super::{ResourceAllocator, ResourceAllocatorState}; + use crate::arch::{self, IRQ_BASE}; + use crate::snapshot::{Persist, Snapshot}; const MAX_IRQS: u32 = arch::IRQ_MAX - arch::IRQ_BASE + 1; @@ -210,4 +279,61 @@ mod tests { assert_eq!(allocator.allocate_gsi(1), Ok(vec![i])); } } + + fn clone_allocator(allocator: &ResourceAllocator) -> ResourceAllocator { + let mut buf = vec![0u8; 1024]; + Snapshot::serialize(&mut buf.as_mut_slice(), &allocator.save()).unwrap(); + let restored_state: ResourceAllocatorState = + Snapshot::deserialize(&mut buf.as_slice()).unwrap(); + ResourceAllocator::restore((), &restored_state).unwrap() + } + + #[test] + fn test_save_restore() { + let allocator0 = ResourceAllocator::new().unwrap(); + let gsi_0 = allocator0.allocate_gsi(1).unwrap()[0]; + assert_eq!(gsi_0, IRQ_BASE); + + let allocator1 = clone_allocator(&allocator0); + let gsi_1 = allocator1.allocate_gsi(1).unwrap()[0]; + assert_eq!(gsi_1, IRQ_BASE + 1); + let 
mmio32_mem = allocator1 + .allocate_32bit_mmio_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(mmio32_mem, arch::MEM_32BIT_DEVICES_START); + let mmio64_mem = allocator1 + .allocate_64bit_mmio_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(mmio64_mem, arch::MEM_64BIT_DEVICES_START); + let system_mem = allocator1 + .allocate_system_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(system_mem, arch::SYSTEM_MEM_START); + + let allocator2 = clone_allocator(&allocator1); + allocator2 + .allocate_32bit_mmio_memory(0x42, 1, AllocPolicy::ExactMatch(mmio32_mem)) + .unwrap_err(); + allocator2 + .allocate_64bit_mmio_memory(0x42, 1, AllocPolicy::ExactMatch(mmio64_mem)) + .unwrap_err(); + allocator2 + .allocate_system_memory(0x42, 1, AllocPolicy::ExactMatch(system_mem)) + .unwrap_err(); + + let gsi_2 = allocator2.allocate_gsi(1).unwrap()[0]; + assert_eq!(gsi_2, IRQ_BASE + 2); + let mmio32_mem = allocator1 + .allocate_32bit_mmio_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(mmio32_mem, arch::MEM_32BIT_DEVICES_START + 0x42); + let mmio64_mem = allocator1 + .allocate_64bit_mmio_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(mmio64_mem, arch::MEM_64BIT_DEVICES_START + 0x42); + let system_mem = allocator1 + .allocate_system_memory(0x42, 1, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(system_mem, arch::SYSTEM_MEM_START + 0x42); + } } diff --git a/src/vmm/src/devices/acpi/vmgenid.rs b/src/vmm/src/devices/acpi/vmgenid.rs index df0656bfbcc..0cf0ae0d7b1 100644 --- a/src/vmm/src/devices/acpi/vmgenid.rs +++ b/src/vmm/src/devices/acpi/vmgenid.rs @@ -152,11 +152,6 @@ impl<'a> Persist<'a> for VmGenId { constructor_args: Self::ConstructorArgs, state: &Self::State, ) -> std::result::Result { - constructor_args.resource_allocator.allocate_system_memory( - VMGENID_MEM_SIZE, - 8, - vm_allocator::AllocPolicy::ExactMatch(state.addr), - )?; Self::from_parts(GuestAddress(state.addr), state.gsi, 
constructor_args.mem) } } diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 18177367ada..b3efc12a500 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -126,6 +126,7 @@ use devices::acpi::vmgenid::VmGenIdError; use devices::virtio::device::VirtioDevice; use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEventSubscriber}; use seccomp::BpfProgram; +use snapshot::Persist; use userfaultfd::Uffd; use vmm_sys_util::epoll::EventSet; use vmm_sys_util::eventfd::EventFd; diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index b8e336f1dad..479a8d75e03 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -599,6 +599,7 @@ mod tests { #[cfg(target_arch = "aarch64")] use crate::construct_kvm_mpidrs; use crate::devices::virtio::block::CacheType; + use crate::snapshot::Persist; use crate::vmm_config::balloon::BalloonDeviceConfig; use crate::vmm_config::net::NetworkInterfaceConfig; use crate::vmm_config::vsock::tests::default_config; From 3f9784092b280800786960a006e43fa1929cde29 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 6 Jun 2025 13:36:38 +0200 Subject: [PATCH 46/56] refactor: VirtIO MMIO persistence logic VirtIO MMIO restore logic activates the device the moment we restore the device state, if the device was activated when snapshotted. Move the activation responsibility to the logic that restores the MMIO transport. The reason for this change is that that's how it will be done for the PCI transport. Unifying this will allow us to reuse the same types for restoring the non-transport state of devices. Note that we needed to change the way Net devices are saved/restored. RxBuffer type of Net devices holds RX descriptors that we have parsed from the Queue ahead of time. The way we restored this info was manipulating the queue to re-parse the RX descriptors during the restore phase. However, we need the device to be activated to do so, which it no longer is at that point.
So, instead of storing this info inside the snapshot make sure we have flushed everything before taking the snapshot. Also, simplify a bit the types that we use for serializing/deserializing the state of a device. Signed-off-by: Babis Chalios --- src/vmm/src/device_manager/mmio.rs | 80 -------- src/vmm/src/device_manager/mod.rs | 109 +++++++++- src/vmm/src/device_manager/persist.rs | 191 ++++++------------ src/vmm/src/devices/virtio/balloon/persist.rs | 36 ++-- src/vmm/src/devices/virtio/block/persist.rs | 10 +- .../devices/virtio/block/virtio/persist.rs | 18 +- src/vmm/src/devices/virtio/net/device.rs | 21 ++ src/vmm/src/devices/virtio/net/persist.rs | 48 +---- src/vmm/src/devices/virtio/rng/persist.rs | 16 +- src/vmm/src/devices/virtio/vsock/persist.rs | 14 +- src/vmm/src/lib.rs | 2 +- src/vmm/src/persist.rs | 15 +- 12 files changed, 224 insertions(+), 336 deletions(-) diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 5031e3104ba..2d6cde39c52 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -16,7 +16,6 @@ use kvm_ioctls::IoEventAddress; use linux_loader::cmdline as kernel_cmdline; #[cfg(target_arch = "x86_64")] use log::debug; -use log::info; use serde::{Deserialize, Serialize}; use vm_allocator::AllocPolicy; @@ -28,14 +27,8 @@ use crate::arch::{RTC_MEM_START, SERIAL_MEM_START}; #[cfg(target_arch = "aarch64")] use crate::devices::legacy::{RTCDevice, SerialDevice}; use crate::devices::pseudo::BootTimer; -use crate::devices::virtio::balloon::Balloon; -use crate::devices::virtio::block::device::Block; use crate::devices::virtio::device::VirtioDevice; -use crate::devices::virtio::net::Net; -use crate::devices::virtio::rng::Entropy; use crate::devices::virtio::transport::mmio::MmioTransport; -use crate::devices::virtio::vsock::{TYPE_VSOCK, Vsock, VsockUnixBackend}; -use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; #[cfg(target_arch = "x86_64")] use 
crate::vstate::memory::GuestAddress; @@ -442,79 +435,6 @@ impl MMIODeviceManager { Ok(()) } - /// Artificially kick devices as if they had external events. - pub fn kick_devices(&self) { - info!("Artificially kick devices."); - // We only kick virtio devices for now. - let _: Result<(), MmioError> = self.for_each_virtio_device(|virtio_type, id, device| { - let mmio_transport_locked = device.inner.lock().expect("Poisoned locked"); - let mut virtio = mmio_transport_locked.locked_device(); - match *virtio_type { - TYPE_BALLOON => { - let balloon = virtio.as_mut_any().downcast_mut::().unwrap(); - // If device is activated, kick the balloon queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in snapshot. - // Stats queue doesn't need kicking as it is notified via a `timer_fd`. - if balloon.is_activated() { - info!("kick balloon {}.", id); - balloon.process_virtio_queues().unwrap(); - } - } - TYPE_BLOCK => { - // We only care about kicking virtio block. - // If we need to kick vhost-user-block we can do nothing. - if let Some(block) = virtio.as_mut_any().downcast_mut::() { - // If device is activated, kick the block queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in - // snapshot. No need to kick Ratelimiters - // because they are restored 'unblocked' so - // any inflight `timer_fd` events can be safely discarded. - if block.is_activated() { - info!("kick block {}.", id); - block.process_virtio_queues().unwrap(); - } - } - } - TYPE_NET => { - let net = virtio.as_mut_any().downcast_mut::().unwrap(); - // If device is activated, kick the net queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in snapshot. - // No need to kick Ratelimiters because they are restored 'unblocked' so - // any inflight `timer_fd` events can be safely discarded. 
- if net.is_activated() { - info!("kick net {}.", id); - net.process_virtio_queues().unwrap(); - } - } - TYPE_VSOCK => { - // Vsock has complicated protocol that isn't resilient to any packet loss, - // so for Vsock we don't support connection persistence through snapshot. - // Any in-flight packets or events are simply lost. - // Vsock is restored 'empty'. - // The only reason we still `kick` it is to make guest process - // `TRANSPORT_RESET_EVENT` event we sent during snapshot creation. - let vsock = virtio - .as_mut_any() - .downcast_mut::>() - .unwrap(); - if vsock.is_activated() { - info!("kick vsock {id}."); - vsock.signal_used_queue(0).unwrap(); - } - } - TYPE_RNG => { - let entropy = virtio.as_mut_any().downcast_mut::().unwrap(); - if entropy.is_activated() { - info!("kick entropy {id}."); - entropy.process_virtio_queues().unwrap(); - } - } - _ => (), - } - Ok(()) - }); - } - #[cfg(target_arch = "aarch64")] pub fn virtio_device_info(&self) -> Vec<&MMIODeviceInfo> { let mut device_info = Vec::new(); diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index e60d64394e8..95e04111b13 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -5,6 +5,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the THIRD-PARTY file. 
+use std::convert::Infallible; use std::fmt::Debug; use std::sync::{Arc, Mutex}; @@ -13,7 +14,7 @@ use event_manager::{MutEventSubscriber, SubscriberOps}; #[cfg(target_arch = "x86_64")] use legacy::{LegacyDeviceError, PortIODeviceManager}; use linux_loader::loader::Cmdline; -use log::error; +use log::{error, info}; use mmio::{MMIODeviceManager, MmioError}; use pci_mngr::{PciDevices, PciDevicesConstructorArgs, PciManagerError}; use persist::{ACPIDeviceManagerConstructorArgs, MMIODevManagerConstructorArgs}; @@ -30,8 +31,14 @@ use crate::devices::legacy::RTCDevice; use crate::devices::legacy::serial::SerialOut; use crate::devices::legacy::{IER_RDA_BIT, IER_RDA_OFFSET, SerialDevice}; use crate::devices::pseudo::BootTimer; +use crate::devices::virtio::balloon::Balloon; +use crate::devices::virtio::block::device::Block; use crate::devices::virtio::device::VirtioDevice; +use crate::devices::virtio::net::Net; +use crate::devices::virtio::rng::Entropy; use crate::devices::virtio::transport::mmio::{IrqTrigger, MmioTransport}; +use crate::devices::virtio::vsock::{TYPE_VSOCK, Vsock, VsockUnixBackend}; +use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; use crate::resources::VmResources; use crate::snapshot::Persist; use crate::vstate::memory::GuestMemoryMmap; @@ -274,6 +281,106 @@ impl DeviceManager { self.pci_devices .attach_pci_segment(&self.resource_allocator) } + + fn do_kick_device(virtio_device: Arc>) { + let mut device = virtio_device.lock().expect("Poisoned lock"); + match device.device_type() { + TYPE_BALLOON => { + let balloon = device.as_mut_any().downcast_mut::().unwrap(); + // If device is activated, kick the balloon queue(s) to make up for any + // pending or in-flight epoll events we may have not captured in snapshot. + // Stats queue doesn't need kicking as it is notified via a `timer_fd`. 
+ if balloon.is_activated() { + info!("kick balloon {}.", balloon.id()); + balloon.process_virtio_queues().unwrap(); + } + } + TYPE_BLOCK => { + // We only care about kicking virtio block. + // If we need to kick vhost-user-block we can do nothing. + if let Some(block) = device.as_mut_any().downcast_mut::() { + // If device is activated, kick the block queue(s) to make up for any + // pending or in-flight epoll events we may have not captured in + // snapshot. No need to kick Ratelimiters + // because they are restored 'unblocked' so + // any inflight `timer_fd` events can be safely discarded. + if block.is_activated() { + info!("kick block {}.", block.id()); + block.process_virtio_queues().unwrap(); + } + } + } + TYPE_NET => { + let net = device.as_mut_any().downcast_mut::().unwrap(); + // If device is activated, kick the net queue(s) to make up for any + // pending or in-flight epoll events we may have not captured in snapshot. + // No need to kick Ratelimiters because they are restored 'unblocked' so + // any inflight `timer_fd` events can be safely discarded. + if net.is_activated() { + info!("kick net {}.", net.id()); + net.process_virtio_queues().unwrap(); + } + } + TYPE_VSOCK => { + // Vsock has complicated protocol that isn't resilient to any packet loss, + // so for Vsock we don't support connection persistence through snapshot. + // Any in-flight packets or events are simply lost. + // Vsock is restored 'empty'. + // The only reason we still `kick` it is to make guest process + // `TRANSPORT_RESET_EVENT` event we sent during snapshot creation. 
+ let vsock = device + .as_mut_any() + .downcast_mut::>() + .unwrap(); + if vsock.is_activated() { + info!("kick vsock {}.", vsock.id()); + vsock.signal_used_queue(0).unwrap(); + } + } + TYPE_RNG => { + let entropy = device.as_mut_any().downcast_mut::().unwrap(); + if entropy.is_activated() { + info!("kick entropy {}.", entropy.id()); + entropy.process_virtio_queues().unwrap(); + } + } + _ => (), + } + } + + /// Artificially kick VirtIO devices as if they had external events. + pub fn kick_virtio_devices(&self) { + info!("Artificially kick devices"); + // Go through MMIO VirtIO devices + let _: Result<(), MmioError> = self.mmio_devices.for_each_virtio_device(|_, _, device| { + let mmio_transport_locked = device.inner.lock().expect("Poisoned lock"); + Self::do_kick_device(mmio_transport_locked.device()); + Ok(()) + }); + } + + fn do_mark_virtio_queue_memory_dirty( + device: Arc>, + mem: &GuestMemoryMmap, + ) { + // SAFETY: + // This should never fail as we mark pages only if device has already been activated, + // and the address validation was already performed on device activation. 
+ let mut locked_device = device.lock().expect("Poisoned lock"); + if locked_device.is_activated() { + locked_device.mark_queue_memory_dirty(mem).unwrap() + } + } + + /// Mark queue memory dirty for activated VirtIO devices + pub fn mark_virtio_queue_memory_dirty(&self, mem: &GuestMemoryMmap) { + // Go through MMIO VirtIO devices + let _: Result<(), Infallible> = self.mmio_devices.for_each_virtio_device(|_, _, device| { + let mmio_transport_locked = device.inner.lock().expect("Poisoned locked"); + Self::do_mark_virtio_queue_memory_dirty(mmio_transport_locked.device(), mem); + Ok(()) + }); + } } #[derive(Debug, Default, Clone, Serialize, Deserialize)] diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 1952fdaee40..93385805e7b 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -42,7 +42,7 @@ use crate::devices::virtio::vsock::persist::{ use crate::devices::virtio::vsock::{ TYPE_VSOCK, Vsock, VsockError, VsockUnixBackend, VsockUnixBackendError, }; -use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; +use crate::devices::virtio::{ActivateError, TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; use crate::mmds::data_store::MmdsVersion; use crate::resources::{ResourcesError, VmResources}; use crate::snapshot::Persist; @@ -78,67 +78,17 @@ pub enum DevicePersistError { Entropy(#[from] EntropyError), /// Resource misconfiguration: {0}. Is the snapshot file corrupted? ResourcesError(#[from] ResourcesError), + /// Could not activate device: {0} + DeviceActivation(#[from] ActivateError), } -/// Holds the state of a balloon device connected to the MMIO space. +/// Holds the state of a MMIO VirtIO device #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ConnectedBalloonState { +pub struct VirtioDeviceState { /// Device identifier. pub device_id: String, /// Device state. - pub device_state: BalloonState, - /// Mmio transport state. 
- pub transport_state: MmioTransportState, - /// VmmResources. - pub device_info: MMIODeviceInfo, -} - -/// Holds the state of a virtio block device connected to the MMIO space. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ConnectedBlockState { - /// Device identifier. - pub device_id: String, - /// Device state. - pub device_state: BlockState, - /// Mmio transport state. - pub transport_state: MmioTransportState, - /// VmmResources. - pub device_info: MMIODeviceInfo, -} - -/// Holds the state of a net device connected to the MMIO space. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ConnectedNetState { - /// Device identifier. - pub device_id: String, - /// Device state. - pub device_state: NetState, - /// Mmio transport state. - pub transport_state: MmioTransportState, - /// VmmResources. - pub device_info: MMIODeviceInfo, -} - -/// Holds the state of a vsock device connected to the MMIO space. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ConnectedVsockState { - /// Device identifier. - pub device_id: String, - /// Device state. - pub device_state: VsockState, - /// Mmio transport state. - pub transport_state: MmioTransportState, - /// VmmResources. - pub device_info: MMIODeviceInfo, -} - -/// Holds the state of an entropy device connected to the MMIO space. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ConnectedEntropyState { - /// Device identifier. - pub device_id: String, - /// Device state. - pub device_state: EntropyState, + pub device_state: T, /// Mmio transport state. pub transport_state: MmioTransportState, /// VmmResources. @@ -187,17 +137,17 @@ pub struct DeviceStates { // State of legacy devices in MMIO space. pub legacy_devices: Vec, /// Block device states. - pub block_devices: Vec, + pub block_devices: Vec>, /// Net device states. - pub net_devices: Vec, + pub net_devices: Vec>, /// Vsock device state. - pub vsock_device: Option, + pub vsock_device: Option>, /// Balloon device state. 
- pub balloon_device: Option, + pub balloon_device: Option>, /// Mmds version. pub mmds_version: Option, /// Entropy device state. - pub entropy_device: Option, + pub entropy_device: Option>, } /// A type used to extract the concrete `Arc>` for each of the device @@ -311,20 +261,22 @@ impl<'a> Persist<'a> for MMIODeviceManager { let _: Result<(), ()> = self.for_each_virtio_device(|_, devid, device| { let mmio_transport_locked = device.inner.lock().expect("Poisoned lock"); let transport_state = mmio_transport_locked.save(); + let device_info = device.resources; + let device_id = devid.clone(); let mut locked_device = mmio_transport_locked.locked_device(); match locked_device.device_type() { TYPE_BALLOON => { - let balloon_state = locked_device + let device_state = locked_device .as_any() .downcast_ref::() .unwrap() .save(); - states.balloon_device = Some(ConnectedBalloonState { - device_id: devid.clone(), - device_state: balloon_state, + states.balloon_device = Some(VirtioDeviceState { + device_id, + device_state, transport_state, - device_info: device.resources, + device_info, }); } // Both virtio-block and vhost-user-block share same device type. 
@@ -337,16 +289,17 @@ impl<'a> Persist<'a> for MMIODeviceManager { ); } else { block.prepare_save(); - states.block_devices.push(ConnectedBlockState { - device_id: devid.clone(), - device_state: block.save(), + let device_state = block.save(); + states.block_devices.push(VirtioDeviceState { + device_id, + device_state, transport_state, - device_info: device.resources, - }) + device_info, + }); } } TYPE_NET => { - let net = locked_device.as_any().downcast_ref::().unwrap(); + let net = locked_device.as_mut_any().downcast_mut::().unwrap(); if let (Some(mmds_ns), None) = (net.mmds_ns.as_ref(), states.mmds_version.as_ref()) { @@ -354,11 +307,13 @@ impl<'a> Persist<'a> for MMIODeviceManager { Some(mmds_ns.mmds.lock().expect("Poisoned lock").version().into()); } - states.net_devices.push(ConnectedNetState { - device_id: devid.clone(), - device_state: net.save(), + net.prepare_save(); + let device_state = net.save(); + states.net_devices.push(VirtioDeviceState { + device_id, + device_state, transport_state, - device_info: device.resources, + device_info, }); } TYPE_VSOCK => { @@ -378,16 +333,16 @@ impl<'a> Persist<'a> for MMIODeviceManager { // Save state after potential notification to the guest. This // way we save changes to the queue the notification can cause. 
- let vsock_state = VsockState { + let device_state = VsockState { backend: vsock.backend().save(), frontend: vsock.save(), }; - states.vsock_device = Some(ConnectedVsockState { - device_id: devid.clone(), - device_state: vsock_state, + states.vsock_device = Some(VirtioDeviceState { + device_id, + device_state, transport_state, - device_info: device.resources, + device_info, }); } TYPE_RNG => { @@ -395,12 +350,13 @@ impl<'a> Persist<'a> for MMIODeviceManager { .as_mut_any() .downcast_mut::() .unwrap(); + let device_state = entropy.save(); - states.entropy_device = Some(ConnectedEntropyState { - device_id: devid.clone(), - device_state: entropy.save(), + states.entropy_device = Some(VirtioDeviceState { + device_id, + device_state, transport_state, - device_info: device.resources, + device_info, }); } _ => unreachable!(), @@ -450,19 +406,20 @@ impl<'a> Persist<'a> for MMIODeviceManager { } let mut restore_helper = |device: Arc>, + activated: bool, is_vhost_user: bool, as_subscriber: Arc>, id: &String, state: &MmioTransportState, - interrupt: Arc, device_info: &MMIODeviceInfo, mmio_bus: &vm_device::Bus, event_manager: &mut EventManager| -> Result<(), Self::Error> { + let interrupt = Arc::new(IrqTrigger::new()); let restore_args = MmioTransportConstructorArgs { mem: mem.clone(), - interrupt, - device, + interrupt: interrupt.clone(), + device: device.clone(), is_vhost_user, }; let mmio_transport = Arc::new(Mutex::new( @@ -480,16 +437,21 @@ impl<'a> Persist<'a> for MMIODeviceManager { }, )?; + if activated { + device + .lock() + .expect("Poisoned lock") + .activate(mem.clone(), interrupt)?; + } + event_manager.add_subscriber(as_subscriber); Ok(()) }; if let Some(balloon_state) = &state.balloon_device { - let interrupt = Arc::new(IrqTrigger::new()); let device = Arc::new(Mutex::new(Balloon::restore( BalloonConstructorArgs { mem: mem.clone(), - interrupt: interrupt.clone(), restored_from_file: constructor_args.restored_from_file, }, &balloon_state.device_state, @@ -501,11 
+463,11 @@ impl<'a> Persist<'a> for MMIODeviceManager { restore_helper( device.clone(), + balloon_state.device_state.virtio_state.activated, false, device, &balloon_state.device_id, &balloon_state.transport_state, - interrupt, &balloon_state.device_info, &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, @@ -513,12 +475,8 @@ impl<'a> Persist<'a> for MMIODeviceManager { } for block_state in &state.block_devices { - let interrupt = Arc::new(IrqTrigger::new()); let device = Arc::new(Mutex::new(Block::restore( - BlockConstructorArgs { - mem: mem.clone(), - interrupt: interrupt.clone(), - }, + BlockConstructorArgs { mem: mem.clone() }, &block_state.device_state, )?)); @@ -528,11 +486,11 @@ impl<'a> Persist<'a> for MMIODeviceManager { restore_helper( device.clone(), + block_state.device_state.is_activated(), false, device, &block_state.device_id, &block_state.transport_state, - interrupt, &block_state.device_info, &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, @@ -556,11 +514,9 @@ impl<'a> Persist<'a> for MMIODeviceManager { } for net_state in &state.net_devices { - let interrupt = Arc::new(IrqTrigger::new()); let device = Arc::new(Mutex::new(Net::restore( NetConstructorArgs { mem: mem.clone(), - interrupt: interrupt.clone(), mmds: constructor_args .vm_resources .mmds @@ -577,11 +533,11 @@ impl<'a> Persist<'a> for MMIODeviceManager { restore_helper( device.clone(), + net_state.device_state.virtio_state.activated, false, device, &net_state.device_id, &net_state.transport_state, - interrupt, &net_state.device_info, &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, @@ -593,11 +549,9 @@ impl<'a> Persist<'a> for MMIODeviceManager { cid: vsock_state.device_state.frontend.cid, }; let backend = VsockUnixBackend::restore(ctor_args, &vsock_state.device_state.backend)?; - let interrupt = Arc::new(IrqTrigger::new()); let device = Arc::new(Mutex::new(Vsock::restore( VsockConstructorArgs { mem: 
mem.clone(), - interrupt: interrupt.clone(), backend, }, &vsock_state.device_state.frontend, @@ -609,11 +563,11 @@ impl<'a> Persist<'a> for MMIODeviceManager { restore_helper( device.clone(), + vsock_state.device_state.frontend.virtio_state.activated, false, device, &vsock_state.device_id, &vsock_state.transport_state, - interrupt, &vsock_state.device_info, &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, @@ -621,8 +575,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { } if let Some(entropy_state) = &state.entropy_device { - let interrupt = Arc::new(IrqTrigger::new()); - let ctor_args = EntropyConstructorArgs::new(mem.clone(), interrupt.clone()); + let ctor_args = EntropyConstructorArgs { mem: mem.clone() }; let device = Arc::new(Mutex::new(Entropy::restore( ctor_args, @@ -635,11 +588,11 @@ impl<'a> Persist<'a> for MMIODeviceManager { restore_helper( device.clone(), + entropy_state.device_state.virtio_state.activated, false, device, &entropy_state.device_id, &entropy_state.transport_state, - interrupt, &entropy_state.device_info, &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, @@ -665,29 +618,8 @@ mod tests { use crate::vmm_config::net::NetworkInterfaceConfig; use crate::vmm_config::vsock::VsockDeviceConfig; - impl PartialEq for ConnectedBalloonState { - fn eq(&self, other: &ConnectedBalloonState) -> bool { - // Actual device state equality is checked by the device's tests. - self.transport_state == other.transport_state && self.device_info == other.device_info - } - } - - impl PartialEq for ConnectedBlockState { - fn eq(&self, other: &ConnectedBlockState) -> bool { - // Actual device state equality is checked by the device's tests. - self.transport_state == other.transport_state && self.device_info == other.device_info - } - } - - impl PartialEq for ConnectedNetState { - fn eq(&self, other: &ConnectedNetState) -> bool { - // Actual device state equality is checked by the device's tests. 
- self.transport_state == other.transport_state && self.device_info == other.device_info - } - } - - impl PartialEq for ConnectedVsockState { - fn eq(&self, other: &ConnectedVsockState) -> bool { + impl PartialEq for VirtioDeviceState { + fn eq(&self, other: &VirtioDeviceState) -> bool { // Actual device state equality is checked by the device's tests. self.transport_state == other.transport_state && self.device_info == other.device_info } @@ -699,6 +631,7 @@ mod tests { && self.block_devices == other.block_devices && self.net_devices == other.net_devices && self.vsock_device == other.vsock_device + && self.entropy_device == other.entropy_device } } diff --git a/src/vmm/src/devices/virtio/balloon/persist.rs b/src/vmm/src/devices/virtio/balloon/persist.rs index a6634d07170..15ae1e26b9e 100644 --- a/src/vmm/src/devices/virtio/balloon/persist.rs +++ b/src/vmm/src/devices/virtio/balloon/persist.rs @@ -87,7 +87,7 @@ pub struct BalloonState { stats_desc_index: Option, latest_stats: BalloonStatsState, config_space: BalloonConfigSpaceState, - virtio_state: VirtioDeviceState, + pub virtio_state: VirtioDeviceState, } /// Auxiliary structure for creating a device when resuming from a snapshot. @@ -95,8 +95,6 @@ pub struct BalloonState { pub struct BalloonConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, - /// Interrupt used from the device. - pub interrupt: Arc, pub restored_from_file: bool, } @@ -154,25 +152,18 @@ impl Persist<'_> for Balloon { actual_pages: state.config_space.actual_pages, }; - if state.virtio_state.activated { - balloon.device_state = DeviceState::Activated(ActiveState { - mem: constructor_args.mem, - interrupt: constructor_args.interrupt, - }); - - if balloon.stats_enabled() { - // Restore the stats descriptor. - balloon.set_stats_desc_index(state.stats_desc_index); - - // Restart timer if needed. 
- let timer_state = TimerState::Periodic { - current: Duration::from_secs(u64::from(state.stats_polling_interval_s)), - interval: Duration::from_secs(u64::from(state.stats_polling_interval_s)), - }; - balloon - .stats_timer - .set_state(timer_state, SetTimeFlags::Default); - } + if state.virtio_state.activated && balloon.stats_enabled() { + // Restore the stats descriptor. + balloon.set_stats_desc_index(state.stats_desc_index); + + // Restart timer if needed. + let timer_state = TimerState::Periodic { + current: Duration::from_secs(u64::from(state.stats_polling_interval_s)), + interval: Duration::from_secs(u64::from(state.stats_polling_interval_s)), + }; + balloon + .stats_timer + .set_state(timer_state, SetTimeFlags::Default); } Ok(balloon) @@ -202,7 +193,6 @@ mod tests { let restored_balloon = Balloon::restore( BalloonConstructorArgs { mem: guest_mem, - interrupt: default_interrupt(), restored_from_file: true, }, &Snapshot::deserialize(&mut mem.as_slice()).unwrap(), diff --git a/src/vmm/src/devices/virtio/block/persist.rs b/src/vmm/src/devices/virtio/block/persist.rs index 57712a8fb3a..cb9a6471137 100644 --- a/src/vmm/src/devices/virtio/block/persist.rs +++ b/src/vmm/src/devices/virtio/block/persist.rs @@ -17,9 +17,17 @@ pub enum BlockState { VhostUser(VhostUserBlockState), } +impl BlockState { + pub fn is_activated(&self) -> bool { + match self { + BlockState::Virtio(virtio_block_state) => virtio_block_state.virtio_state.activated, + BlockState::VhostUser(vhost_user_block_state) => false, + } + } +} + /// Auxiliary structure for creating a device when resuming from a snapshot. 
#[derive(Debug)] pub struct BlockConstructorArgs { pub mem: GuestMemoryMmap, - pub interrupt: Arc, } diff --git a/src/vmm/src/devices/virtio/block/virtio/persist.rs b/src/vmm/src/devices/virtio/block/virtio/persist.rs index 57e4a11b9c1..1c7a1bce106 100644 --- a/src/vmm/src/devices/virtio/block/virtio/persist.rs +++ b/src/vmm/src/devices/virtio/block/virtio/persist.rs @@ -58,7 +58,7 @@ pub struct VirtioBlockState { cache_type: CacheType, root_device: bool, disk_path: String, - virtio_state: VirtioDeviceState, + pub virtio_state: VirtioDeviceState, rate_limiter_state: RateLimiterState, file_engine_type: FileEngineTypeState, } @@ -111,15 +111,6 @@ impl Persist<'_> for VirtioBlock { let avail_features = state.virtio_state.avail_features; let acked_features = state.virtio_state.acked_features; - let device_state = if state.virtio_state.activated { - DeviceState::Activated(ActiveState { - mem: constructor_args.mem, - interrupt: constructor_args.interrupt, - }) - } else { - DeviceState::Inactive - }; - let config_space = ConfigSpace { capacity: disk_properties.nsectors.to_le(), }; @@ -132,7 +123,7 @@ impl Persist<'_> for VirtioBlock { queues, queue_evts, - device_state, + device_state: DeviceState::Inactive, id: state.id.clone(), partuuid: state.partuuid.clone(), @@ -227,10 +218,7 @@ mod tests { // Restore the block device. 
let restored_block = VirtioBlock::restore( - BlockConstructorArgs { - mem: guest_mem, - interrupt: default_interrupt(), - }, + BlockConstructorArgs { mem: guest_mem }, &Snapshot::deserialize(&mut mem.as_slice()).unwrap(), ) .unwrap(); diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index cf9f445d5df..4c6022a0067 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -8,6 +8,7 @@ use std::collections::VecDeque; use std::mem::{self}; use std::net::Ipv4Addr; +use std::num::Wrapping; use std::ops::Deref; use std::sync::{Arc, Mutex}; @@ -936,6 +937,26 @@ impl Net { Ok(()) } + + /// Prepare saving state + pub fn prepare_save(&mut self) { + // We shouldn't be messing with the queue if the device is not activated. + // Anyways, if it isn't there's nothing to prepare; we haven't parsed any + // descriptors yet from it and we can't have a deferred frame. + if !self.is_activated() { + return; + } + + // Give potential deferred RX frame to guest + self.rx_buffer.finish_frame(&mut self.queues[RX_INDEX]); + // Reset the parsed available descriptors, so we will re-parse them + self.queues[RX_INDEX].next_avail -= + Wrapping(u16::try_from(self.rx_buffer.parsed_descriptors.len()).unwrap()); + self.rx_buffer.parsed_descriptors.clear(); + self.rx_buffer.iovec.clear(); + self.rx_buffer.used_bytes = 0; + self.rx_buffer.used_descriptors = 0; + } } impl VirtioDevice for Net { diff --git a/src/vmm/src/devices/virtio/net/persist.rs b/src/vmm/src/devices/virtio/net/persist.rs index 5ebd15f9d54..6ef8ad842ac 100644 --- a/src/vmm/src/devices/virtio/net/persist.rs +++ b/src/vmm/src/devices/virtio/net/persist.rs @@ -30,27 +30,6 @@ pub struct NetConfigSpaceState { guest_mac: Option, } -/// Information about the parsed RX buffers -#[derive(Debug, Default, Clone, Serialize, Deserialize)] -pub struct RxBufferState { - // Number of iovecs we have parsed from the guest - parsed_descriptor_chains_nr: u16, - 
// Number of used descriptors - used_descriptors: u16, - // Number of used bytes - used_bytes: u32, -} - -impl RxBufferState { - fn from_rx_buffers(rx_buffer: &RxBuffers) -> Self { - RxBufferState { - parsed_descriptor_chains_nr: rx_buffer.parsed_descriptors.len().try_into().unwrap(), - used_descriptors: rx_buffer.used_descriptors, - used_bytes: rx_buffer.used_bytes, - } - } -} - /// Information about the network device that are saved /// at snapshot. #[derive(Debug, Clone, Serialize, Deserialize)] @@ -62,8 +41,7 @@ pub struct NetState { /// The associated MMDS network stack. pub mmds_ns: Option, config_space: NetConfigSpaceState, - virtio_state: VirtioDeviceState, - rx_buffers_state: RxBufferState, + pub virtio_state: VirtioDeviceState, } /// Auxiliary structure for creating a device when resuming from a snapshot. @@ -71,8 +49,6 @@ pub struct NetState { pub struct NetConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, - /// Interrupt for the device. - pub interrupt: Arc, /// Pointer to the MMDS data store. pub mmds: Option>>, } @@ -108,7 +84,6 @@ impl Persist<'_> for Net { guest_mac: self.guest_mac, }, virtio_state: VirtioDeviceState::from_device(self), - rx_buffers_state: RxBufferState::from_rx_buffers(&self.rx_buffer), } } @@ -153,26 +128,6 @@ impl Persist<'_> for Net { net.avail_features = state.virtio_state.avail_features; net.acked_features = state.virtio_state.acked_features; - if state.virtio_state.activated { - let supported_flags: u32 = Net::build_tap_offload_features(net.acked_features); - net.tap - .set_offload(supported_flags) - .map_err(NetPersistError::TapSetOffload)?; - - net.device_state = DeviceState::Activated(ActiveState { - mem: constructor_args.mem, - interrupt: constructor_args.interrupt, - }); - - // Recreate `Net::rx_buffer`. We do it by re-parsing the RX queue. We're temporarily - // rolling back `next_avail` in the RX queue and call `parse_rx_descriptors`. 
- net.queues[RX_INDEX].next_avail -= state.rx_buffers_state.parsed_descriptor_chains_nr; - net.parse_rx_descriptors() - .map_err(|e| NetPersistError::VirtioState(VirtioStateError::InvalidAvailIdx(e)))?; - net.rx_buffer.used_descriptors = state.rx_buffers_state.used_descriptors; - net.rx_buffer.used_bytes = state.rx_buffers_state.used_bytes; - } - Ok(net) } } @@ -216,7 +171,6 @@ mod tests { match Net::restore( NetConstructorArgs { mem: guest_mem, - interrupt: default_interrupt(), mmds: mmds_ds, }, &Snapshot::deserialize(&mut mem.as_slice()).unwrap(), diff --git a/src/vmm/src/devices/virtio/rng/persist.rs b/src/vmm/src/devices/virtio/rng/persist.rs index 75db947c9c7..d266e259418 100644 --- a/src/vmm/src/devices/virtio/rng/persist.rs +++ b/src/vmm/src/devices/virtio/rng/persist.rs @@ -19,20 +19,13 @@ use crate::vstate::memory::GuestMemoryMmap; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct EntropyState { - virtio_state: VirtioDeviceState, + pub virtio_state: VirtioDeviceState, rate_limiter_state: RateLimiterState, } #[derive(Debug)] pub struct EntropyConstructorArgs { - mem: GuestMemoryMmap, - interrupt: Arc, -} - -impl EntropyConstructorArgs { - pub fn new(mem: GuestMemoryMmap, interrupt: Arc) -> Self { - Self { mem, interrupt } - } + pub mem: GuestMemoryMmap, } #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -72,9 +65,6 @@ impl Persist<'_> for Entropy { let mut entropy = Entropy::new_with_queues(queues, rate_limiter)?; entropy.set_avail_features(state.virtio_state.avail_features); entropy.set_acked_features(state.virtio_state.acked_features); - if state.virtio_state.activated { - entropy.set_activated(constructor_args.mem, constructor_args.interrupt); - } Ok(entropy) } @@ -99,7 +89,7 @@ mod tests { let guest_mem = create_virtio_mem(); let restored = Entropy::restore( - EntropyConstructorArgs::new(guest_mem, default_interrupt()), + EntropyConstructorArgs { mem: guest_mem }, &Snapshot::deserialize(&mut mem.as_slice()).unwrap(), ) .unwrap(); diff 
--git a/src/vmm/src/devices/virtio/vsock/persist.rs b/src/vmm/src/devices/virtio/vsock/persist.rs index 9d2fd61d9d5..6775707da3e 100644 --- a/src/vmm/src/devices/virtio/vsock/persist.rs +++ b/src/vmm/src/devices/virtio/vsock/persist.rs @@ -31,7 +31,7 @@ pub struct VsockState { pub struct VsockFrontendState { /// Context Identifier. pub cid: u64, - virtio_state: VirtioDeviceState, + pub virtio_state: VirtioDeviceState, } /// An enum for the serializable backend state types. @@ -53,8 +53,6 @@ pub struct VsockUdsState { pub struct VsockConstructorArgs { /// Pointer to guest memory. pub mem: GuestMemoryMmap, - /// Interrupt to use for the device. - pub interrupt: Arc, /// The vsock Unix Backend. pub backend: B, } @@ -123,14 +121,7 @@ where vsock.acked_features = state.virtio_state.acked_features; vsock.avail_features = state.virtio_state.avail_features; - vsock.device_state = if state.virtio_state.activated { - DeviceState::Activated(ActiveState { - mem: constructor_args.mem, - interrupt: constructor_args.interrupt, - }) - } else { - DeviceState::Inactive - }; + vsock.device_state = DeviceState::Inactive; Ok(vsock) } } @@ -193,7 +184,6 @@ pub(crate) mod tests { let mut restored_device = Vsock::restore( VsockConstructorArgs { mem: ctx.mem.clone(), - interrupt: default_interrupt(), backend: match restored_state.backend { VsockBackendState::Uds(uds_state) => { assert_eq!(uds_state.path, "test".to_owned()); diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index b3efc12a500..d4cb5a78344 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -390,7 +390,7 @@ impl Vmm { /// Sends a resume command to the vCPUs. pub fn resume_vm(&mut self) -> Result<(), VmmError> { - self.device_manager.mmio_devices.kick_devices(); + self.device_manager.kick_virtio_devices(); // Send the events. 
self.vcpus_handles diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 479a8d75e03..2c9b1a0eea6 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -166,21 +166,8 @@ pub fn create_snapshot( // We need to mark queues as dirty again for all activated devices. The reason we // do it here is that we don't mark pages as dirty during runtime // for queue objects. - // SAFETY: - // This should never fail as we only mark pages only if device has already been activated, - // and the address validation was already performed on device activation. vmm.device_manager - .mmio_devices - .for_each_virtio_device(|_, _, device| { - let mmio_dev_locked = device.inner.lock().expect("Poisoned lock"); - let mut d = mmio_dev_locked.locked_device(); - if d.is_activated() { - d.mark_queue_memory_dirty(vmm.vm.guest_memory()) - } else { - Ok(()) - } - }) - .unwrap(); + .mark_virtio_queue_memory_dirty(vmm.vm.guest_memory()); Ok(()) } From effb904d5276490127c0d0592afcd53ae3b8d2b6 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Thu, 5 Jun 2025 11:39:21 +0200 Subject: [PATCH 47/56] pci: support snapshotting VirtIO PCI devices Support serializing the device-specific and transport state of a VirtIO device that uses the PCI transport. Signed-off-by: Babis Chalios --- src/vmm/src/builder.rs | 4 +- src/vmm/src/device_manager/mod.rs | 19 +- src/vmm/src/device_manager/pci_mngr.rs | 421 +++++++++++++++++- .../virtio/transport/pci/common_config.rs | 2 +- .../devices/virtio/transport/pci/device.rs | 173 ++++--- 5 files changed, 551 insertions(+), 68 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 3284b11f559..d8d69a97314 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -451,6 +451,8 @@ pub fn build_microvm_from_snapshot( // Restore the boot source config paths. vm_resources.boot_source.config = microvm_state.vm_info.boot_source; + let vm = Arc::new(vm); + // Restore devices states. 
// Restoring VMGenID injects an interrupt in the guest to notify it about the new generation // ID. As a result, we need to restore DeviceManager after restoring the KVM state, otherwise @@ -473,7 +475,7 @@ pub fn build_microvm_from_snapshot( instance_info: instance_info.clone(), shutdown_exit_code: None, kvm, - vm: Arc::new(vm), + vm, uffd, vcpus_handles: Vec::new(), vcpus_exit_evt, diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 95e04111b13..260f3337673 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -357,6 +357,11 @@ impl DeviceManager { Self::do_kick_device(mmio_transport_locked.device()); Ok(()) }); + // Go through PCI VirtIO devices + for device in self.pci_devices.virtio_devices.values() { + let virtio_device = device.lock().expect("Poisoned lock").virtio_device(); + Self::do_kick_device(virtio_device); + } } fn do_mark_virtio_queue_memory_dirty( @@ -380,6 +385,12 @@ impl DeviceManager { Self::do_mark_virtio_queue_memory_dirty(mmio_transport_locked.device(), mem); Ok(()) }); + + // Go through PCI VirtIO devices + for device in self.pci_devices.virtio_devices.values() { + let virtio_device = device.lock().expect("Poisoned lock").virtio_device(); + Self::do_mark_virtio_queue_memory_dirty(virtio_device, mem); + } } } @@ -416,7 +427,7 @@ pub enum DevicePersistError { pub struct DeviceRestoreArgs<'a> { pub mem: &'a GuestMemoryMmap, - pub vm: &'a Vm, + pub vm: &'a Arc, pub event_manager: &'a mut EventManager, pub vcpus_exit_evt: &'a EventFd, pub vm_resources: &'a mut VmResources, @@ -491,6 +502,12 @@ impl<'a> Persist<'a> for DeviceManager { // Restore PCI devices let pci_ctor_args = PciDevicesConstructorArgs { resource_allocator: &resource_allocator, + vm: constructor_args.vm.clone(), + mem: constructor_args.mem, + vm_resources: constructor_args.vm_resources, + instance_id: constructor_args.instance_id, + restored_from_file: constructor_args.restored_from_file, + event_manager: 
constructor_args.event_manager, }; let pci_devices = PciDevices::restore(pci_ctor_args, &state.pci_state)?; diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index 70bb03388f6..26a44dd29c9 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -5,19 +5,37 @@ use std::collections::HashMap; use std::fmt::Debug; use std::sync::{Arc, Mutex}; -use event_manager::MutEventSubscriber; -use log::debug; +use event_manager::{MutEventSubscriber, SubscriberOps}; +use log::{debug, error, warn}; use pci::{PciBarRegionType, PciDevice, PciDeviceError, PciRootError}; use serde::{Deserialize, Serialize}; use vm_device::BusError; -use crate::Vm; +use super::persist::{MmdsVersionState, SharedDeviceType}; use crate::device_manager::resources::ResourceAllocator; use crate::devices::pci::PciSegment; +use crate::devices::virtio::balloon::Balloon; +use crate::devices::virtio::balloon::persist::{BalloonConstructorArgs, BalloonState}; +use crate::devices::virtio::block::device::Block; +use crate::devices::virtio::block::persist::{BlockConstructorArgs, BlockState}; use crate::devices::virtio::device::VirtioDevice; -use crate::devices::virtio::transport::pci::device::{VirtioPciDevice, VirtioPciDeviceError}; +use crate::devices::virtio::net::Net; +use crate::devices::virtio::net::persist::{NetConstructorArgs, NetState}; +use crate::devices::virtio::rng::Entropy; +use crate::devices::virtio::rng::persist::{EntropyConstructorArgs, EntropyState}; +use crate::devices::virtio::transport::pci::device::{ + VirtioPciDevice, VirtioPciDeviceError, VirtioPciDeviceState, +}; +use crate::devices::virtio::vsock::persist::{ + VsockConstructorArgs, VsockState, VsockUdsConstructorArgs, +}; +use crate::devices::virtio::vsock::{TYPE_VSOCK, Vsock, VsockUnixBackend}; +use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; +use crate::resources::VmResources; use crate::snapshot::Persist; -use 
crate::vstate::vm::InterruptError; +use crate::vstate::memory::GuestMemoryMmap; +use crate::vstate::vm::{InterruptError, MsiVectorGroup}; +use crate::{EventManager, Vm}; #[derive(Debug, Default)] pub struct PciDevices { @@ -119,6 +137,7 @@ impl PciDevices { let pci_device_bdf = pci_segment.next_device_bdf()?; debug!("Allocating BDF: {pci_device_bdf:?} for device"); let mem = vm.guest_memory().clone(); + let device_type: u32 = device.lock().expect("Poisoned lock").device_type(); // Allocate one MSI vector per queue, plus one for configuration let msix_num = @@ -153,6 +172,9 @@ impl PciDevices { .expect("Poisoned lock") .add_device(pci_device_bdf.device() as u32, virtio_device.clone())?; + self.virtio_devices + .insert((device_type, id.clone()), virtio_device.clone()); + Self::register_bars_with_bus(resource_allocator, &virtio_device)?; virtio_device .lock() @@ -162,6 +184,54 @@ impl PciDevices { Ok(()) } + fn restore_pci_device( + &mut self, + vm: &Arc, + resource_allocator: &ResourceAllocator, + device: Arc>, + device_id: &str, + transport_state: &VirtioPciDeviceState, + event_manager: &mut EventManager, + ) -> Result<(), PciManagerError> { + // We should only be reaching this point if PCI is enabled + let pci_segment = self.pci_segment.as_ref().unwrap(); + let msi_vector_group = Arc::new(MsiVectorGroup::restore( + vm.clone(), + &transport_state.msi_vector_group, + )?); + let device_type: u32 = device.lock().expect("Poisoned lock").device_type(); + + let virtio_device = Arc::new(Mutex::new(VirtioPciDevice::new_from_state( + device_id.to_string(), + vm.guest_memory().clone(), + device.clone(), + msi_vector_group, + transport_state.clone(), + )?)); + + pci_segment + .pci_bus + .lock() + .expect("Poisoned lock") + .add_device( + transport_state.pci_device_bdf.device() as u32, + virtio_device.clone(), + )?; + + self.virtio_devices + .insert((device_type, device_id.to_string()), virtio_device.clone()); + + Self::register_bars_with_bus(resource_allocator, 
&virtio_device)?; + virtio_device + .lock() + .expect("Poisoned lock") + .register_notification_ioevent(vm)?; + + event_manager.add_subscriber(device); + + Ok(()) + } + /// Gets the specified device. pub fn get_virtio_device( &self, @@ -173,14 +243,57 @@ impl PciDevices { } } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VirtioDeviceState { + /// Device identifier + pub device_id: String, + /// Device BDF + pub pci_device_bdf: u32, + /// Device state + pub device_state: T, + /// Transport state + pub transport_state: VirtioPciDeviceState, +} + #[derive(Default, Debug, Clone, Serialize, Deserialize)] pub struct PciDevicesState { - pci_enabled: bool, + /// Whether PCI is enabled + pub pci_enabled: bool, + /// Block device states. + pub block_devices: Vec>, + /// Net device states. + pub net_devices: Vec>, + /// Vsock device state. + pub vsock_device: Option>, + /// Balloon device state. + pub balloon_device: Option>, + /// Mmds version. + pub mmds_version: Option, + /// Entropy device state. 
+ pub entropy_device: Option>, } -#[derive(Debug)] pub struct PciDevicesConstructorArgs<'a> { + pub vm: Arc, + pub mem: &'a GuestMemoryMmap, pub resource_allocator: &'a Arc, + pub vm_resources: &'a mut VmResources, + pub instance_id: &'a str, + pub restored_from_file: bool, + pub event_manager: &'a mut EventManager, +} + +impl<'a> Debug for PciDevicesConstructorArgs<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PciDevicesConstructorArgs") + .field("vm", &self.vm) + .field("mem", &self.mem) + .field("resource_allocator", &self.resource_allocator) + .field("vm_resources", &self.vm_resources) + .field("instance_id", &self.instance_id) + .field("restored_from_file", &self.restored_from_file) + .finish() + } } impl<'a> Persist<'a> for PciDevices { @@ -189,19 +302,305 @@ impl<'a> Persist<'a> for PciDevices { type Error = PciManagerError; fn save(&self) -> Self::State { - PciDevicesState { - pci_enabled: self.pci_segment.is_some(), + let mut state = PciDevicesState::default(); + if self.pci_segment.is_some() { + state.pci_enabled = true; + } else { + return state; } + + for pci_dev in self.virtio_devices.values() { + let locked_pci_dev = pci_dev.lock().expect("Poisoned lock"); + let transport_state = locked_pci_dev.state(); + let virtio_dev = locked_pci_dev.virtio_device(); + let mut locked_virtio_dev = virtio_dev.lock().expect("Poisoned lock"); + + let pci_device_bdf = transport_state.pci_device_bdf.into(); + + match locked_virtio_dev.device_type() { + TYPE_BALLOON => { + let balloon_device = locked_virtio_dev + .as_any() + .downcast_ref::() + .unwrap(); + + let device_state = balloon_device.save(); + + state.balloon_device = Some(VirtioDeviceState { + device_id: balloon_device.id().to_string(), + pci_device_bdf, + device_state, + transport_state, + }); + } + TYPE_BLOCK => { + let block_dev = locked_virtio_dev + .as_mut_any() + .downcast_mut::() + .unwrap(); + if block_dev.is_vhost_user() { + warn!( + "Skipping 
vhost-user-block device. VhostUserBlock does not support \ + snapshotting yet" + ); + } else { + block_dev.prepare_save(); + let device_state = block_dev.save(); + state.block_devices.push(VirtioDeviceState { + device_id: block_dev.id().to_string(), + pci_device_bdf, + device_state, + transport_state, + }); + } + } + TYPE_NET => { + let net_dev = locked_virtio_dev + .as_mut_any() + .downcast_mut::() + .unwrap(); + if let (Some(mmds_ns), None) = + (net_dev.mmds_ns.as_ref(), state.mmds_version.as_ref()) + { + state.mmds_version = + Some(mmds_ns.mmds.lock().expect("Poisoned lock").version().into()); + } + net_dev.prepare_save(); + let device_state = net_dev.save(); + + state.net_devices.push(VirtioDeviceState { + device_id: net_dev.id().to_string(), + pci_device_bdf, + device_state, + transport_state, + }) + } + TYPE_VSOCK => { + let vsock_dev = locked_virtio_dev + .as_mut_any() + // Currently, VsockUnixBackend is the only implementation of VsockBackend. + .downcast_mut::>() + .unwrap(); + + // Send Transport event to reset connections if device + // is activated. + if vsock_dev.is_activated() { + vsock_dev + .send_transport_reset_event() + .unwrap_or_else(|err| { + error!("Failed to send reset transport event: {:?}", err); + }); + } + + // Save state after potential notification to the guest. This + // way we save changes to the queue the notification can cause. 
+ let vsock_state = VsockState { + backend: vsock_dev.backend().save(), + frontend: vsock_dev.save(), + }; + + state.vsock_device = Some(VirtioDeviceState { + device_id: vsock_dev.id().to_string(), + pci_device_bdf, + device_state: vsock_state, + transport_state, + }); + } + TYPE_RNG => { + let rng_dev = locked_virtio_dev + .as_mut_any() + .downcast_mut::() + .unwrap(); + let device_state = rng_dev.save(); + + state.entropy_device = Some(VirtioDeviceState { + device_id: rng_dev.id().to_string(), + pci_device_bdf, + device_state, + transport_state, + }) + } + _ => unreachable!(), + } + } + + state } fn restore( constructor_args: Self::ConstructorArgs, state: &Self::State, ) -> std::result::Result { + let mem = constructor_args.mem; let mut pci_devices = PciDevices::new(); + if !state.pci_enabled { + return Ok(pci_devices); + } + + pci_devices.attach_pci_segment(constructor_args.resource_allocator)?; + + if let Some(balloon_state) = &state.balloon_device { + let device = Arc::new(Mutex::new( + Balloon::restore( + BalloonConstructorArgs { + mem: mem.clone(), + restored_from_file: constructor_args.restored_from_file, + }, + &balloon_state.device_state, + ) + .unwrap(), + )); + + constructor_args + .vm_resources + .update_from_restored_device(SharedDeviceType::Balloon(device.clone())) + .unwrap(); + + pci_devices + .restore_pci_device( + &constructor_args.vm, + constructor_args.resource_allocator, + device, + &balloon_state.device_id, + &balloon_state.transport_state, + constructor_args.event_manager, + ) + .unwrap() + } + + for block_state in &state.block_devices { + let device = Arc::new(Mutex::new( + Block::restore( + BlockConstructorArgs { mem: mem.clone() }, + &block_state.device_state, + ) + .unwrap(), + )); + + constructor_args + .vm_resources + .update_from_restored_device(SharedDeviceType::VirtioBlock(device.clone())) + .unwrap(); + + pci_devices + .restore_pci_device( + &constructor_args.vm, + constructor_args.resource_allocator, + device, + 
&block_state.device_id, + &block_state.transport_state, + constructor_args.event_manager, + ) + .unwrap() + } + + // If the snapshot has the mmds version persisted, initialise the data store with it. + if let Some(mmds_version) = &state.mmds_version { + constructor_args + .vm_resources + .set_mmds_version(mmds_version.clone().into(), constructor_args.instance_id) + .unwrap(); + } else if state + .net_devices + .iter() + .any(|dev| dev.device_state.mmds_ns.is_some()) + { + // If there's at least one network device having an mmds_ns, it means + // that we are restoring from a version that did not persist the `MmdsVersionState`. + // Init with the default. + constructor_args.vm_resources.mmds_or_default(); + } + + for net_state in &state.net_devices { + let device = Arc::new(Mutex::new( + Net::restore( + NetConstructorArgs { + mem: mem.clone(), + mmds: constructor_args + .vm_resources + .mmds + .as_ref() + // Clone the Arc reference. + .cloned(), + }, + &net_state.device_state, + ) + .unwrap(), + )); + + constructor_args + .vm_resources + .update_from_restored_device(SharedDeviceType::Network(device.clone())) + .unwrap(); + + pci_devices + .restore_pci_device( + &constructor_args.vm, + constructor_args.resource_allocator, + device, + &net_state.device_id, + &net_state.transport_state, + constructor_args.event_manager, + ) + .unwrap() + } + + if let Some(vsock_state) = &state.vsock_device { + let ctor_args = VsockUdsConstructorArgs { + cid: vsock_state.device_state.frontend.cid, + }; + let backend = + VsockUnixBackend::restore(ctor_args, &vsock_state.device_state.backend).unwrap(); + let device = Arc::new(Mutex::new( + Vsock::restore( + VsockConstructorArgs { + mem: mem.clone(), + backend, + }, + &vsock_state.device_state.frontend, + ) + .unwrap(), + )); + + constructor_args + .vm_resources + .update_from_restored_device(SharedDeviceType::Vsock(device.clone())) + .unwrap(); + + pci_devices + .restore_pci_device( + &constructor_args.vm, + 
constructor_args.resource_allocator, + device, + &vsock_state.device_id, + &vsock_state.transport_state, + constructor_args.event_manager, + ) + .unwrap() + } + + if let Some(entropy_state) = &state.entropy_device { + let ctor_args = EntropyConstructorArgs { mem: mem.clone() }; + + let device = Arc::new(Mutex::new( + Entropy::restore(ctor_args, &entropy_state.device_state).unwrap(), + )); + + constructor_args + .vm_resources + .update_from_restored_device(SharedDeviceType::Entropy(device.clone())) + .unwrap(); - if state.pci_enabled { - pci_devices.attach_pci_segment(constructor_args.resource_allocator)?; + pci_devices + .restore_pci_device( + &constructor_args.vm, + constructor_args.resource_allocator, + device, + &entropy_state.device_id, + &entropy_state.transport_state, + constructor_args.event_manager, + ) + .unwrap() } Ok(pci_devices) diff --git a/src/vmm/src/devices/virtio/transport/pci/common_config.rs b/src/vmm/src/devices/virtio/transport/pci/common_config.rs index c8ee2d1d2a9..6e52a1ca007 100644 --- a/src/vmm/src/devices/virtio/transport/pci/common_config.rs +++ b/src/vmm/src/devices/virtio/transport/pci/common_config.rs @@ -144,7 +144,7 @@ impl VirtioPciCommonConfig { } } - fn state(&self) -> VirtioPciCommonConfigState { + pub fn state(&self) -> VirtioPciCommonConfigState { VirtioPciCommonConfigState { driver_status: self.driver_status, config_generation: self.config_generation, diff --git a/src/vmm/src/devices/virtio/transport/pci/device.rs b/src/vmm/src/devices/virtio/transport/pci/device.rs index 20c169297fd..6793d502f00 100644 --- a/src/vmm/src/devices/virtio/transport/pci/device.rs +++ b/src/vmm/src/devices/virtio/transport/pci/device.rs @@ -9,6 +9,7 @@ use std::any::Any; use std::cmp; +use std::collections::HashMap; use std::fmt::{Debug, Formatter}; use std::io::Write; use std::sync::atomic::{AtomicBool, AtomicU16, AtomicU32, AtomicUsize, Ordering}; @@ -41,6 +42,7 @@ use crate::devices::virtio::transport::pci::common_config::{ use 
crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; use crate::devices::virtio::{TYPE_BLOCK, TYPE_NET}; use crate::logger::{debug, error}; +use crate::snapshot::Persist; use crate::utils::u64_to_usize; use crate::vstate::memory::GuestMemoryMmap; use crate::vstate::vm::{InterruptError, MsiVectorGroup}; @@ -283,8 +285,8 @@ const NOTIFY_OFF_MULTIPLIER: u32 = 4; // A dword per notification address. const VIRTIO_PCI_VENDOR_ID: u16 = 0x1af4; const VIRTIO_PCI_DEVICE_ID_BASE: u16 = 0x1040; // Add to device type to get device ID. -#[derive(Debug, Serialize, Deserialize)] -struct QueueState { +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct QueueState { max_size: u16, size: u16, ready: bool, @@ -293,14 +295,18 @@ struct QueueState { used_ring: u64, } -#[derive(Debug, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct VirtioPciDeviceState { pub pci_device_bdf: PciBdf, - device_activated: bool, - queues: Vec, - interrupt_status: usize, - cap_pci_cfg_offset: usize, - cap_pci_cfg: Vec, + pub device_activated: bool, + pub interrupt_status: usize, + pub cap_pci_cfg_offset: usize, + pub cap_pci_cfg: Vec, + pub pci_configuration_state: PciConfigurationState, + pub pci_dev_state: VirtioPciCommonConfigState, + pub msix_state: MsixConfigState, + pub msi_vector_group: HashMap, + pub bar_configuration: Vec, } #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -337,7 +343,7 @@ pub struct VirtioPciDevice { // PCI interrupts. interrupt_status: Arc, virtio_interrupt: Option>, - interrupt_source_group: Arc, + interrupt_source_group: Arc, // Guest memory memory: GuestMemoryMmap, @@ -421,7 +427,6 @@ impl VirtioPciDevice { } /// Constructs a new PCI transport for the given virtio device. 
- #[allow(clippy::too_many_arguments)] pub fn new( id: String, memory: GuestMemoryMmap, @@ -464,7 +469,7 @@ impl VirtioPciDevice { device, device_activated: Arc::new(AtomicBool::new(false)), interrupt_status: Arc::new(AtomicUsize::new(0)), - virtio_interrupt: None, + virtio_interrupt: Some(interrupt), memory, settings_bar: 0, use_64bit_bar: true, @@ -476,6 +481,70 @@ impl VirtioPciDevice { Ok(virtio_pci_device) } + pub fn new_from_state( + id: String, + memory: GuestMemoryMmap, + device: Arc>, + msi_vectors: Arc, + state: VirtioPciDeviceState, + ) -> Result { + let msix_config = Self::msix_config( + state.pci_device_bdf.into(), + msi_vectors.clone(), + Some(state.msix_state), + )?; + + let pci_config = Self::pci_configuration( + device.lock().expect("Poisoned lock").device_type(), + &msix_config, + Some(state.pci_configuration_state), + ); + let virtio_common_config = VirtioPciCommonConfig::new(state.pci_dev_state); + let cap_pci_cfg_info = VirtioPciCfgCapInfo { + offset: state.cap_pci_cfg_offset, + cap: *VirtioPciCfgCap::from_slice(&state.cap_pci_cfg).unwrap(), + }; + + let interrupt = Arc::new(VirtioInterruptMsix::new( + msix_config.clone(), + virtio_common_config.msix_config.clone(), + virtio_common_config.msix_queues.clone(), + msi_vectors.clone(), + )); + + let virtio_pci_device = VirtioPciDevice { + id, + pci_device_bdf: state.pci_device_bdf, + configuration: pci_config, + common_config: virtio_common_config, + msix_config: Some(msix_config), + msix_num: msi_vectors.num_vectors(), + device, + device_activated: Arc::new(AtomicBool::new(state.device_activated)), + interrupt_status: Arc::new(AtomicUsize::new(state.interrupt_status)), + virtio_interrupt: Some(interrupt), + memory: memory.clone(), + settings_bar: 0, + use_64bit_bar: true, + interrupt_source_group: msi_vectors, + cap_pci_cfg_info, + bar_regions: state.bar_configuration, + }; + + if state.device_activated { + virtio_pci_device + .device + .lock() + .expect("Poisoned lock") + .activate( + memory, + 
virtio_pci_device.virtio_interrupt.as_ref().unwrap().clone(), + ); + } + + Ok(virtio_pci_device) + } + fn is_driver_ready(&self) -> bool { let ready_bits = (DEVICE_ACKNOWLEDGE | DEVICE_DRIVER | DEVICE_DRIVER_OK | DEVICE_FEATURES_OK); @@ -657,6 +726,27 @@ impl VirtioPciDevice { } Ok(()) } + + pub fn state(&self) -> VirtioPciDeviceState { + VirtioPciDeviceState { + pci_device_bdf: self.pci_device_bdf, + device_activated: self.device_activated.load(Ordering::Acquire), + interrupt_status: self.interrupt_status.load(Ordering::Acquire), + cap_pci_cfg_offset: self.cap_pci_cfg_info.offset, + cap_pci_cfg: self.cap_pci_cfg_info.cap.bytes().to_vec(), + pci_configuration_state: self.configuration.state(), + pci_dev_state: self.common_config.state(), + msix_state: self + .msix_config + .as_ref() + .unwrap() + .lock() + .expect("Poisoned lock") + .state(), + msi_vector_group: self.interrupt_source_group.save(), + bar_configuration: self.bar_regions.clone(), + } + } } pub struct VirtioInterruptMsix { @@ -796,57 +886,33 @@ impl PciDevice for VirtioPciDevice { &mut self, mmio32_allocator: &mut AddressAllocator, mmio64_allocator: &mut AddressAllocator, - resources: Option>, + _resources: Option>, ) -> std::result::Result, PciDeviceError> { let mut bars = Vec::new(); let device_clone = self.device.clone(); let device = device_clone.lock().unwrap(); - let mut settings_bar_addr = None; - let mut use_64bit_bar = self.use_64bit_bar; - let restoring = resources.is_some(); - if let Some(resources) = resources { - for resource in resources { - if let Resource::PciBar { - index, base, type_, .. - } = resource - { - if index == VIRTIO_COMMON_BAR_INDEX { - settings_bar_addr = Some(GuestAddress(base)); - use_64bit_bar = match type_ { - PciBarType::Io => { - return Err(PciDeviceError::InvalidResource(resource)); - } - PciBarType::Mmio32 => false, - PciBarType::Mmio64 => true, - }; - break; - } - } - } - // Error out if no resource was matching the BAR id. 
- if settings_bar_addr.is_none() { - return Err(PciDeviceError::MissingResource); - } - } - // Allocate the virtio-pci capability BAR. // See http://docs.oasis-open.org/virtio/virtio/v1.0/cs04/virtio-v1.0-cs04.html#x1-740004 - let policy = match settings_bar_addr { - Some(addr) => AllocPolicy::ExactMatch(addr.0), - None => AllocPolicy::FirstMatch, - }; - let (virtio_pci_bar_addr, region_type) = if use_64bit_bar { + let (virtio_pci_bar_addr, region_type) = if self.use_64bit_bar { let region_type = PciBarRegionType::Memory64BitRegion; let addr = mmio64_allocator - .allocate(CAPABILITY_BAR_SIZE, CAPABILITY_BAR_SIZE, policy) + .allocate( + CAPABILITY_BAR_SIZE, + CAPABILITY_BAR_SIZE, + AllocPolicy::FirstMatch, + ) .unwrap() .start(); (addr, region_type) } else { let region_type = PciBarRegionType::Memory32BitRegion; let addr = mmio32_allocator - .allocate(CAPABILITY_BAR_SIZE, CAPABILITY_BAR_SIZE, policy) + .allocate( + CAPABILITY_BAR_SIZE, + CAPABILITY_BAR_SIZE, + AllocPolicy::FirstMatch, + ) .unwrap() .start(); (addr, region_type) @@ -862,14 +928,12 @@ impl PciDevice for VirtioPciDevice { // happen only during the creation of a brand new VM. When a VM is // restored from a known state, the BARs are already created with the // right content, therefore we don't need to go through this codepath. - if !restoring { - self.configuration - .add_pci_bar(&bar) - .map_err(|e| PciDeviceError::IoRegistrationFailed(virtio_pci_bar_addr, e))?; + self.configuration + .add_pci_bar(&bar) + .map_err(|e| PciDeviceError::IoRegistrationFailed(virtio_pci_bar_addr, e))?; - // Once the BARs are allocated, the capabilities can be added to the PCI configuration. - self.add_pci_capabilities(VIRTIO_COMMON_BAR_INDEX.try_into().unwrap())?; - } + // Once the BARs are allocated, the capabilities can be added to the PCI configuration. 
+ self.add_pci_capabilities(VIRTIO_COMMON_BAR_INDEX.try_into().unwrap())?; bars.push(bar); @@ -1015,6 +1079,7 @@ impl PciDevice for VirtioPciDevice { Arc::clone(self.virtio_interrupt.as_ref().unwrap()), ) .unwrap_or_else(|err| error!("Error activating device: {err:?}")); + self.device_activated.store(true, Ordering::SeqCst); } else { debug!("Device doesn't need activation"); } From 2fc15cb19e0cb439c78395f0d23d71b8a6c24653 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 13 Jun 2025 16:45:02 +0200 Subject: [PATCH 48/56] refactor(vm): move ResourceAllocator inside Vm The ResourceAllocator object was part of DeviceManager since it is (mainly) devices that use it. ResourceAllocator is also the object that implements (in a dummy way, for the moment) the DeviceRelocation trait which PciDevices use to move the address space of a PciDevice when triggered from the guest. The problem with DeviceRelocation is that it also needs the Vm file descriptor to perform the relocation, because we need to register the new IO event fd for VirtIO devices. To make things simpler, move ResourceAllocator inside the Vm object. In a subsequent commit we will remove the DeviceRelocation from ResourceAllocator and move it to Vm instead. This has the nice secondary effect that we were able to simplify the signature of many device-related methods that received Vm and ResourceAllocator arguments.
Signed-off-by: Babis Chalios --- src/vmm/src/acpi/mod.rs | 50 ++++++++--------- src/vmm/src/arch/aarch64/fdt.rs | 9 +--- src/vmm/src/arch/x86_64/mod.rs | 11 ++-- src/vmm/src/arch/x86_64/mptable.rs | 2 +- src/vmm/src/arch/x86_64/vm.rs | 5 ++ src/vmm/src/builder.rs | 21 ++++---- src/vmm/src/device_manager/legacy.rs | 11 ++-- src/vmm/src/device_manager/mmio.rs | 48 ++++------------- src/vmm/src/device_manager/mod.rs | 54 +++++-------------- src/vmm/src/device_manager/pci_mngr.rs | 29 +++------- src/vmm/src/device_manager/persist.rs | 24 ++------- src/vmm/src/devices/acpi/vmgenid.rs | 2 +- src/vmm/src/devices/pci/pci_segment.rs | 2 +- .../devices/virtio/transport/pci/device.rs | 6 +-- src/vmm/src/lib.rs | 4 +- src/vmm/src/vstate/mod.rs | 2 + .../{device_manager => vstate}/resources.rs | 17 +++--- src/vmm/src/vstate/vm.rs | 20 +++---- 18 files changed, 113 insertions(+), 204 deletions(-) rename src/vmm/src/{device_manager => vstate}/resources.rs (96%) diff --git a/src/vmm/src/acpi/mod.rs b/src/vmm/src/acpi/mod.rs index a3e471aed9e..51711d9eb92 100644 --- a/src/vmm/src/acpi/mod.rs +++ b/src/vmm/src/acpi/mod.rs @@ -12,8 +12,8 @@ use crate::acpi::x86_64::{ }; use crate::arch::x86_64::layout; use crate::device_manager::DeviceManager; -use crate::device_manager::resources::ResourceAllocator; use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; +use crate::vstate::resources::ResourceAllocator; mod x86_64; @@ -80,7 +80,11 @@ impl AcpiTableWriter<'_> { } /// Build the DSDT table for the guest - fn build_dsdt(&mut self, device_manager: &mut DeviceManager) -> Result { + fn build_dsdt( + &mut self, + device_manager: &mut DeviceManager, + resource_allocator: &ResourceAllocator, + ) -> Result { let mut dsdt_data = Vec::new(); // Virtio-devices DSDT data @@ -99,7 +103,7 @@ impl AcpiTableWriter<'_> { setup_arch_dsdt(&mut dsdt_data)?; let mut dsdt = Dsdt::new(OEM_ID, *b"FCVMDSDT", OEM_REVISION, dsdt_data); - self.write_acpi_table(&device_manager.resource_allocator, &mut dsdt) + 
self.write_acpi_table(resource_allocator, &mut dsdt) } /// Build the FADT table for the guest @@ -193,26 +197,16 @@ impl AcpiTableWriter<'_> { pub(crate) fn create_acpi_tables( mem: &GuestMemoryMmap, device_manager: &mut DeviceManager, + resource_allocator: &ResourceAllocator, vcpus: &[Vcpu], ) -> Result<(), AcpiError> { let mut writer = AcpiTableWriter { mem }; - let dsdt_addr = writer.build_dsdt(device_manager)?; - - let fadt_addr = writer.build_fadt(&device_manager.resource_allocator, dsdt_addr)?; - let madt_addr = writer.build_madt( - &device_manager.resource_allocator, - vcpus.len().try_into().unwrap(), - )?; - let mcfg_addr = writer.build_mcfg( - &device_manager.resource_allocator, - layout::PCI_MMCONFIG_START, - )?; - let xsdt_addr = writer.build_xsdt( - &device_manager.resource_allocator, - fadt_addr, - madt_addr, - mcfg_addr, - )?; + let dsdt_addr = writer.build_dsdt(device_manager, resource_allocator)?; + + let fadt_addr = writer.build_fadt(resource_allocator, dsdt_addr)?; + let madt_addr = writer.build_madt(resource_allocator, vcpus.len().try_into().unwrap())?; + let mcfg_addr = writer.build_mcfg(resource_allocator, layout::PCI_MMCONFIG_START)?; + let xsdt_addr = writer.build_xsdt(resource_allocator, fadt_addr, madt_addr, mcfg_addr)?; writer.build_rsdp(xsdt_addr) } @@ -224,8 +218,8 @@ mod tests { use crate::acpi::{AcpiError, AcpiTableWriter}; use crate::arch::x86_64::layout::{SYSTEM_MEM_SIZE, SYSTEM_MEM_START}; use crate::builder::tests::default_vmm; - use crate::device_manager::resources::ResourceAllocator; use crate::utils::u64_to_usize; + use crate::vstate::resources::ResourceAllocator; use crate::vstate::vm::tests::setup_vm_with_memory; struct MockSdt(Vec); @@ -259,14 +253,14 @@ mod tests { // This should succeed let mut sdt = MockSdt(vec![0; 4096]); let addr = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START); // 
Let's try to write two 4K pages plus one byte let mut sdt = MockSdt(vec![0; usize::try_from(SYSTEM_MEM_SIZE + 1).unwrap()]); let err = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap_err(); assert!( matches!( @@ -281,27 +275,27 @@ mod tests { // succeed. let mut sdt = MockSdt(vec![0; 5]); let addr = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4096); let mut sdt = MockSdt(vec![0; 2]); let addr = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4101); let mut sdt = MockSdt(vec![0; 4]); let addr = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4103); let mut sdt = MockSdt(vec![0; 8]); let addr = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4107); let mut sdt = MockSdt(vec![0; 16]); let addr = writer - .write_acpi_table(&vmm.device_manager.resource_allocator, &mut sdt) + .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4115); } diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index a2a4992eb29..0073d7dbc05 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -536,14 +536,7 @@ mod tests { let dummy = Arc::new(Mutex::new(DummyDevice::new())); device_manager .mmio_devices - .register_virtio_test_device( - &vm, - mem.clone(), - &device_manager.resource_allocator, - dummy, - &mut cmdline, - 
"dummy", - ) + .register_virtio_test_device(&vm, mem.clone(), dummy, &mut cmdline, "dummy") .unwrap(); create_fdt( diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index 68b903d5ff6..5307dbdf710 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -217,7 +217,7 @@ pub fn configure_system_for_boot( // Note that this puts the mptable at the last 1k of Linux's 640k base RAM mptable::setup_mptable( vm.guest_memory(), - &device_manager.resource_allocator, + &vm.common.resource_allocator, vcpu_config.vcpu_count, ) .map_err(ConfigurationError::MpTableSetup)?; @@ -238,7 +238,12 @@ pub fn configure_system_for_boot( // Create ACPI tables and write them in guest memory // For the time being we only support ACPI in x86_64 - create_acpi_tables(vm.guest_memory(), device_manager, vcpus)?; + create_acpi_tables( + vm.guest_memory(), + device_manager, + &vm.common.resource_allocator, + vcpus, + )?; Ok(()) } @@ -568,9 +573,9 @@ mod tests { use linux_loader::loader::bootparam::boot_e820_entry; use super::*; - use crate::device_manager::resources::ResourceAllocator; use crate::test_utils::{arch_mem, single_region_mem}; use crate::utils::mib_to_bytes; + use crate::vstate::resources::ResourceAllocator; #[test] fn regions_lt_4gb() { diff --git a/src/vmm/src/arch/x86_64/mptable.rs b/src/vmm/src/arch/x86_64/mptable.rs index c397290c23e..17b2900aeb2 100644 --- a/src/vmm/src/arch/x86_64/mptable.rs +++ b/src/vmm/src/arch/x86_64/mptable.rs @@ -15,10 +15,10 @@ use vm_allocator::AllocPolicy; use crate::arch::IRQ_MAX; use crate::arch::x86_64::generated::mpspec; -use crate::device_manager::resources::ResourceAllocator; use crate::vstate::memory::{ Address, ByteValued, Bytes, GuestAddress, GuestMemory, GuestMemoryMmap, }; +use crate::vstate::resources::ResourceAllocator; // These `mpspec` wrapper types are only data, reading them from data is a safe initialization. 
// SAFETY: POD diff --git a/src/vmm/src/arch/x86_64/vm.rs b/src/vmm/src/arch/x86_64/vm.rs index e84b4338e35..9d22bf9a757 100644 --- a/src/vmm/src/arch/x86_64/vm.rs +++ b/src/vmm/src/arch/x86_64/vm.rs @@ -11,8 +11,10 @@ use kvm_ioctls::Cap; use serde::{Deserialize, Serialize}; use crate::arch::x86_64::msr::MsrError; +use crate::snapshot::Persist; use crate::utils::u64_to_usize; use crate::vstate::memory::{GuestMemoryExtension, GuestMemoryState}; +use crate::vstate::resources::ResourceAllocatorState; use crate::vstate::vm::{VmCommon, VmError}; /// Error type for [`Vm::restore_state`] @@ -187,6 +189,7 @@ impl ArchVm { Ok(VmState { memory: self.common.guest_memory.describe(), + resource_allocator: self.common.resource_allocator.save(), pitstate, clock, pic_master, @@ -211,6 +214,8 @@ impl ArchVm { pub struct VmState { /// guest memory state pub memory: GuestMemoryState, + /// resource allocator + pub resource_allocator: ResourceAllocatorState, pitstate: kvm_pit_state2, clock: kvm_clock_data, // TODO: rename this field to adopt inclusive language once Linux updates it, too. 
diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index d8d69a97314..4b998fdf138 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -47,6 +47,8 @@ use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::MachineConfigError; use crate::vstate::kvm::{Kvm, KvmError}; use crate::vstate::memory::GuestRegionMmap; +#[cfg(target_arch = "aarch64")] +use crate::vstate::resources::ResourceAllocator; use crate::vstate::vcpu::VcpuError; use crate::vstate::vm::{Vm, VmError}; use crate::{EventManager, Vmm, VmmError}; @@ -188,7 +190,7 @@ pub fn build_microvm_for_boot( .collect::, _>>()?; if vm_resources.pci_enabled { - device_manager.enable_pci()?; + device_manager.enable_pci(&vm)?; } else { boot_cmdline.insert("pci", "off")?; } @@ -197,7 +199,7 @@ pub fn build_microvm_for_boot( // to maintain the same MMIO address referenced in the documentation // and tests. if vm_resources.boot_timer { - device_manager.attach_boot_timer_device(request_ts)?; + device_manager.attach_boot_timer_device(&vm, request_ts)?; } if let Some(balloon) = vm_resources.balloon.get() { @@ -252,7 +254,7 @@ pub fn build_microvm_for_boot( #[cfg(target_arch = "aarch64")] if vcpus[0].kvm_vcpu.supports_pvtime() { - setup_pvtime(&mut device_manager, &mut vcpus)?; + setup_pvtime(&vm.common.resource_allocator, &mut vcpus)?; } else { log::warn!("Vcpus do not support pvtime, steal time will not be reported to guest"); } @@ -513,13 +515,12 @@ const STEALTIME_STRUCT_MEM_SIZE: u64 = 64; /// Helper method to allocate steal time region #[cfg(target_arch = "aarch64")] fn allocate_pvtime_region( - device_manager: &mut DeviceManager, + resource_allocator: &ResourceAllocator, vcpu_count: usize, policy: vm_allocator::AllocPolicy, ) -> Result { let size = STEALTIME_STRUCT_MEM_SIZE * vcpu_count as u64; - let addr = device_manager - .resource_allocator + let addr = resource_allocator .allocate_system_memory(size, STEALTIME_STRUCT_MEM_SIZE, policy) 
.map_err(StartMicrovmError::AllocateResources)?; Ok(GuestAddress(addr)) @@ -528,12 +529,12 @@ fn allocate_pvtime_region( /// Sets up pvtime for all vcpus #[cfg(target_arch = "aarch64")] fn setup_pvtime( - device_manager: &mut DeviceManager, + resource_allocator: &ResourceAllocator, vcpus: &mut [Vcpu], ) -> Result<(), StartMicrovmError> { // Alloc sys mem for steal time region let pvtime_mem: GuestAddress = allocate_pvtime_region( - device_manager, + resource_allocator, vcpus.len(), vm_allocator::AllocPolicy::LastMatch, )?; @@ -1141,7 +1142,9 @@ pub(crate) mod tests { let mut vmm = default_vmm(); let request_ts = TimestampUs::default(); - let res = vmm.device_manager.attach_boot_timer_device(request_ts); + let res = vmm + .device_manager + .attach_boot_timer_device(&vmm.vm, request_ts); res.unwrap(); assert!(vmm.device_manager.mmio_devices.boot_timer.is_some()); } diff --git a/src/vmm/src/device_manager/legacy.rs b/src/vmm/src/device_manager/legacy.rs index 7011ae71122..47b259ef87b 100644 --- a/src/vmm/src/device_manager/legacy.rs +++ b/src/vmm/src/device_manager/legacy.rs @@ -97,11 +97,7 @@ impl PortIODeviceManager { } /// Register supported legacy devices. 
- pub fn register_devices( - &mut self, - io_bus: &vm_device::Bus, - vm: &Vm, - ) -> Result<(), LegacyDeviceError> { + pub fn register_devices(&mut self, vm: &Vm) -> Result<(), LegacyDeviceError> { let serial_2_4 = Arc::new(Mutex::new(SerialDevice { serial: Serial::with_events( self.com_evt_2_4.try_clone()?.try_clone()?, @@ -122,6 +118,8 @@ impl PortIODeviceManager { ), input: None, })); + + let io_bus = &vm.common.resource_allocator.pio_bus; io_bus.insert( self.stdio_serial.clone(), Self::SERIAL_PORT_ADDRESSES[0], @@ -243,7 +241,6 @@ mod tests { #[test] fn test_register_legacy_devices() { let (_, vm) = setup_vm_with_memory(0x1000); - let io_bus = vm_device::Bus::new(); vm.setup_irqchip().unwrap(); let mut ldm = PortIODeviceManager::new( Arc::new(Mutex::new(SerialDevice { @@ -261,6 +258,6 @@ mod tests { )), ) .unwrap(); - ldm.register_devices(&io_bus, &vm).unwrap(); + ldm.register_devices(&vm).unwrap(); } } diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 2d6cde39c52..da32cf14271 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -19,7 +19,6 @@ use log::debug; use serde::{Deserialize, Serialize}; use vm_allocator::AllocPolicy; -use super::resources::ResourceAllocator; use crate::Vm; use crate::arch::BOOT_DEVICE_MEM_START; #[cfg(target_arch = "aarch64")] @@ -31,6 +30,7 @@ use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::transport::mmio::MmioTransport; #[cfg(target_arch = "x86_64")] use crate::vstate::memory::GuestAddress; +use crate::vstate::resources::ResourceAllocator; /// Errors for MMIO device manager. #[derive(Debug, thiserror::Error, displaydoc::Display)] @@ -180,7 +180,6 @@ impl MMIODeviceManager { &mut self, vm: &Vm, device_id: String, - mmio_bus: &vm_device::Bus, device: MMIODevice, ) -> Result<(), MmioError> { // Our virtio devices are currently hardcoded to use a single IRQ. 
@@ -203,7 +202,7 @@ impl MMIODeviceManager { .map_err(MmioError::RegisterIrqFd)?; } - mmio_bus.insert( + vm.common.resource_allocator.mmio_bus.insert( device.inner.clone(), device.resources.addr, device.resources.len, @@ -239,13 +238,12 @@ impl MMIODeviceManager { pub fn register_mmio_virtio_for_boot( &mut self, vm: &Vm, - resource_allocator: &ResourceAllocator, device_id: String, mmio_device: MmioTransport, _cmdline: &mut kernel_cmdline::Cmdline, ) -> Result<(), MmioError> { let device = MMIODevice { - resources: self.allocate_mmio_resources(resource_allocator, 1)?, + resources: self.allocate_mmio_resources(&vm.common.resource_allocator, 1)?, inner: Arc::new(Mutex::new(mmio_device)), }; @@ -261,7 +259,7 @@ impl MMIODeviceManager { device.resources.irq.unwrap().get(), )?; } - self.register_mmio_virtio(vm, device_id, &resource_allocator.mmio_bus, device)?; + self.register_mmio_virtio(vm, device_id, device)?; Ok(()) } @@ -271,7 +269,6 @@ impl MMIODeviceManager { pub fn register_mmio_serial( &mut self, vm: &Vm, - resource_allocator: &ResourceAllocator, serial: Arc>, device_info_opt: Option, ) -> Result<(), MmioError> { @@ -280,7 +277,7 @@ impl MMIODeviceManager { let device_info = if let Some(device_info) = device_info_opt { device_info } else { - let gsi = resource_allocator.allocate_gsi(1)?; + let gsi = vm.common.resource_allocator.allocate_gsi(1)?; MMIODeviceInfo { addr: SERIAL_MEM_START, len: MMIO_LEN, @@ -299,7 +296,7 @@ impl MMIODeviceManager { inner: serial, }; - resource_allocator.mmio_bus.insert( + vm.common.resource_allocator.mmio_bus.insert( device.inner.clone(), device.resources.addr, device.resources.len, @@ -481,20 +478,13 @@ pub(crate) mod tests { &mut self, vm: &Vm, guest_mem: GuestMemoryMmap, - resource_allocator: &ResourceAllocator, device: Arc>, cmdline: &mut kernel_cmdline::Cmdline, dev_id: &str, ) -> Result { let interrupt = Arc::new(IrqTrigger::new()); let mmio_device = MmioTransport::new(guest_mem, interrupt, device.clone(), false); - 
self.register_mmio_virtio_for_boot( - vm, - resource_allocator, - dev_id.to_string(), - mmio_device, - cmdline, - )?; + self.register_mmio_virtio_for_boot(vm, dev_id.to_string(), mmio_device, cmdline)?; Ok(self .get_virtio_device(device.lock().unwrap().device_type(), dev_id) .unwrap() @@ -601,7 +591,6 @@ pub(crate) mod tests { let mut vm = Vm::new(&kvm).unwrap(); vm.register_memory_regions(guest_mem).unwrap(); let mut device_manager = MMIODeviceManager::new(); - let resource_allocator = ResourceAllocator::new().unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); let dummy = Arc::new(Mutex::new(DummyDevice::new())); @@ -614,7 +603,6 @@ pub(crate) mod tests { .register_virtio_test_device( &vm, vm.guest_memory().clone(), - &resource_allocator, dummy, &mut cmdline, "dummy", @@ -655,7 +643,6 @@ pub(crate) mod tests { let mut vm = Vm::new(&kvm).unwrap(); vm.register_memory_regions(guest_mem).unwrap(); let mut device_manager = MMIODeviceManager::new(); - let resource_allocator = ResourceAllocator::new().unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); #[cfg(target_arch = "x86_64")] @@ -668,7 +655,6 @@ pub(crate) mod tests { .register_virtio_test_device( &vm, vm.guest_memory().clone(), - &resource_allocator, Arc::new(Mutex::new(DummyDevice::new())), &mut cmdline, "dummy1", @@ -682,7 +668,6 @@ pub(crate) mod tests { .register_virtio_test_device( &vm, vm.guest_memory().clone(), - &resource_allocator, Arc::new(Mutex::new(DummyDevice::new())), &mut cmdline, "dummy2" @@ -717,21 +702,13 @@ pub(crate) mod tests { vm.setup_irqchip(1).unwrap(); let mut device_manager = MMIODeviceManager::new(); - let resource_allocator = ResourceAllocator::new().unwrap(); let mut cmdline = kernel_cmdline::Cmdline::new(4096).unwrap(); let dummy = Arc::new(Mutex::new(DummyDevice::new())); let type_id = dummy.lock().unwrap().device_type(); let id = String::from("foo"); let addr = device_manager - .register_virtio_test_device( - &vm, - 
vm.guest_memory().clone(), - &resource_allocator, - dummy, - &mut cmdline, - &id, - ) + .register_virtio_test_device(&vm, vm.guest_memory().clone(), dummy, &mut cmdline, &id) .unwrap(); assert!(device_manager.get_virtio_device(type_id, &id).is_some()); assert_eq!( @@ -755,14 +732,7 @@ pub(crate) mod tests { let dummy2 = Arc::new(Mutex::new(DummyDevice::new())); let id2 = String::from("foo2"); device_manager - .register_virtio_test_device( - &vm, - vm.guest_memory().clone(), - &resource_allocator, - dummy2, - &mut cmdline, - &id2, - ) + .register_virtio_test_device(&vm, vm.guest_memory().clone(), dummy2, &mut cmdline, &id2) .unwrap(); let mut count = 0; diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 260f3337673..f037a4a8d05 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -18,7 +18,6 @@ use log::{error, info}; use mmio::{MMIODeviceManager, MmioError}; use pci_mngr::{PciDevices, PciDevicesConstructorArgs, PciManagerError}; use persist::{ACPIDeviceManagerConstructorArgs, MMIODevManagerConstructorArgs}; -use resources::ResourceAllocator; use serde::{Deserialize, Serialize}; use utils::time::TimestampUs; use vmm_sys_util::eventfd::EventFd; @@ -54,8 +53,6 @@ pub mod mmio; pub mod pci_mngr; /// Device managers (de)serialization support. pub mod persist; -/// Resource manager for devices. 
-pub mod resources; #[derive(Debug, thiserror::Error, displaydoc::Display)] /// Error while creating a new [`DeviceManager`] @@ -93,8 +90,6 @@ pub enum AttachDeviceError { #[derive(Debug)] /// A manager of all peripheral devices of Firecracker pub struct DeviceManager { - /// Allocator for system memory and interrupt numbers - pub resource_allocator: Arc, /// MMIO devices pub mmio_devices: MMIODeviceManager, #[cfg(target_arch = "x86_64")] @@ -139,7 +134,6 @@ impl DeviceManager { event_manager: &mut EventManager, vcpus_exit_evt: &EventFd, vm: &Vm, - resource_allocator: &ResourceAllocator, ) -> Result { Self::set_stdout_nonblocking(); @@ -153,7 +147,7 @@ impl DeviceManager { // create pio dev manager with legacy devices let mut legacy_devices = PortIODeviceManager::new(serial, i8042)?; - legacy_devices.register_devices(&resource_allocator.pio_bus, vm)?; + legacy_devices.register_devices(vm)?; Ok(legacy_devices) } @@ -163,13 +157,10 @@ impl DeviceManager { vcpus_exit_evt: &EventFd, vm: &Vm, ) -> Result { - let resource_allocator = Arc::new(ResourceAllocator::new()?); #[cfg(target_arch = "x86_64")] - let legacy_devices = - Self::create_legacy_devices(event_manager, vcpus_exit_evt, vm, &resource_allocator)?; + let legacy_devices = Self::create_legacy_devices(event_manager, vcpus_exit_evt, vm)?; Ok(DeviceManager { - resource_allocator, mmio_devices: MMIODeviceManager::new(), #[cfg(target_arch = "x86_64")] legacy_devices, @@ -193,13 +184,8 @@ impl DeviceManager { // The device mutex mustn't be locked here otherwise it will deadlock. 
let device = MmioTransport::new(vm.guest_memory().clone(), interrupt, device, is_vhost_user); - self.mmio_devices.register_mmio_virtio_for_boot( - vm, - &self.resource_allocator, - id, - device, - cmdline, - )?; + self.mmio_devices + .register_mmio_virtio_for_boot(vm, id, device, cmdline)?; Ok(()) } @@ -214,8 +200,7 @@ impl DeviceManager { is_vhost_user: bool, ) -> Result<(), AttachDeviceError> { if self.pci_devices.pci_segment.is_some() { - self.pci_devices - .attach_pci_virtio_device(vm, &self.resource_allocator, id, device)?; + self.pci_devices.attach_pci_virtio_device(vm, id, device)?; } else { self.attach_mmio_virtio_device(vm, id, device, cmdline, is_vhost_user)?; } @@ -226,12 +211,13 @@ impl DeviceManager { /// Attaches a [`BootTimer`] to the VM pub(crate) fn attach_boot_timer_device( &mut self, + vm: &Vm, request_ts: TimestampUs, ) -> Result<(), AttachDeviceError> { let boot_timer = Arc::new(Mutex::new(BootTimer::new(request_ts))); self.mmio_devices - .register_mmio_boot_timer(&self.resource_allocator.mmio_bus, boot_timer)?; + .register_mmio_boot_timer(&vm.common.resource_allocator.mmio_bus, boot_timer)?; Ok(()) } @@ -241,7 +227,7 @@ impl DeviceManager { mem: &GuestMemoryMmap, vm: &Vm, ) -> Result<(), AttachDeviceError> { - let vmgenid = VmGenId::new(mem, &self.resource_allocator)?; + let vmgenid = VmGenId::new(mem, &vm.common.resource_allocator)?; self.acpi_devices.attach_vmgenid(vmgenid, vm)?; Ok(()) } @@ -265,21 +251,19 @@ impl DeviceManager { // Make stdout non-blocking. 
Self::set_stdout_nonblocking(); let serial = Self::setup_serial_device(event_manager)?; - self.mmio_devices - .register_mmio_serial(vm, &self.resource_allocator, serial, None)?; + self.mmio_devices.register_mmio_serial(vm, serial, None)?; self.mmio_devices.add_mmio_serial_to_cmdline(cmdline)?; } let rtc = Arc::new(Mutex::new(RTCDevice::new())); self.mmio_devices - .register_mmio_rtc(&self.resource_allocator, rtc, None)?; + .register_mmio_rtc(&vm.common.resource_allocator, rtc, None)?; Ok(()) } /// Enables PCIe support for Firecracker devices - pub fn enable_pci(&mut self) -> Result<(), PciManagerError> { - self.pci_devices - .attach_pci_segment(&self.resource_allocator) + pub fn enable_pci(&mut self, vm: &Arc) -> Result<(), PciManagerError> { + self.pci_devices.attach_pci_segment(vm) } fn do_kick_device(virtio_device: Arc>) { @@ -397,8 +381,6 @@ impl DeviceManager { #[derive(Debug, Default, Clone, Serialize, Deserialize)] /// State of devices in the system pub struct DevicesState { - /// Resource allocator state - pub resource_allocator_state: resources::ResourceAllocatorState, /// MMIO devices state pub mmio_state: persist::DeviceStates, /// ACPI devices state @@ -454,7 +436,6 @@ impl<'a> Persist<'a> for DeviceManager { fn save(&self) -> Self::State { DevicesState { - resource_allocator_state: self.resource_allocator.save(), mmio_state: self.mmio_devices.save(), acpi_state: self.acpi_devices.save(), pci_state: self.pci_devices.save(), @@ -465,17 +446,12 @@ impl<'a> Persist<'a> for DeviceManager { constructor_args: Self::ConstructorArgs, state: &Self::State, ) -> std::result::Result { - // Safe to unwrap here. ResourceAllocator restoring cannot fail. 
- let resource_allocator = - Arc::new(ResourceAllocator::restore((), &state.resource_allocator_state).unwrap()); - // Setup legacy devices in case of x86 #[cfg(target_arch = "x86_64")] let legacy_devices = Self::create_legacy_devices( constructor_args.event_manager, constructor_args.vcpus_exit_evt, constructor_args.vm, - &resource_allocator, )?; // Restore MMIO devices @@ -483,7 +459,6 @@ impl<'a> Persist<'a> for DeviceManager { mem: constructor_args.mem, vm: constructor_args.vm, event_manager: constructor_args.event_manager, - resource_allocator: &resource_allocator, vm_resources: constructor_args.vm_resources, instance_id: constructor_args.instance_id, restored_from_file: constructor_args.restored_from_file, @@ -493,7 +468,6 @@ impl<'a> Persist<'a> for DeviceManager { // Restore ACPI devices let acpi_ctor_args = ACPIDeviceManagerConstructorArgs { mem: constructor_args.mem, - resource_allocator: &resource_allocator, vm: constructor_args.vm, }; let mut acpi_devices = ACPIDeviceManager::restore(acpi_ctor_args, &state.acpi_state)?; @@ -501,7 +475,6 @@ impl<'a> Persist<'a> for DeviceManager { // Restore PCI devices let pci_ctor_args = PciDevicesConstructorArgs { - resource_allocator: &resource_allocator, vm: constructor_args.vm.clone(), mem: constructor_args.mem, vm_resources: constructor_args.vm_resources, @@ -512,7 +485,6 @@ impl<'a> Persist<'a> for DeviceManager { let pci_devices = PciDevices::restore(pci_ctor_args, &state.pci_state)?; let device_manager = DeviceManager { - resource_allocator, mmio_devices, #[cfg(target_arch = "x86_64")] legacy_devices, @@ -578,7 +550,6 @@ pub(crate) mod tests { let mmio_devices = MMIODeviceManager::new(); let acpi_devices = ACPIDeviceManager::new(); let pci_devices = PciDevices::new(); - let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); #[cfg(target_arch = "x86_64")] let legacy_devices = PortIODeviceManager::new( @@ -592,7 +563,6 @@ pub(crate) mod tests { .unwrap(); DeviceManager { - resource_allocator, 
mmio_devices, #[cfg(target_arch = "x86_64")] legacy_devices, diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index 26a44dd29c9..199c6ec3c7c 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -12,7 +12,6 @@ use serde::{Deserialize, Serialize}; use vm_device::BusError; use super::persist::{MmdsVersionState, SharedDeviceType}; -use crate::device_manager::resources::ResourceAllocator; use crate::devices::pci::PciSegment; use crate::devices::virtio::balloon::Balloon; use crate::devices::virtio::balloon::persist::{BalloonConstructorArgs, BalloonState}; @@ -34,6 +33,7 @@ use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; use crate::resources::VmResources; use crate::snapshot::Persist; use crate::vstate::memory::GuestMemoryMmap; +use crate::vstate::resources::ResourceAllocator; use crate::vstate::vm::{InterruptError, MsiVectorGroup}; use crate::{EventManager, Vm}; @@ -68,17 +68,14 @@ impl PciDevices { Default::default() } - pub fn attach_pci_segment( - &mut self, - resource_allocator: &Arc, - ) -> Result<(), PciManagerError> { + pub fn attach_pci_segment(&mut self, vm: &Arc) -> Result<(), PciManagerError> { // We only support a single PCIe segment. Calling this function twice is a Firecracker // internal error. assert!(self.pci_segment.is_none()); // Currently we don't assign any IRQs to PCI devices. We will be using MSI-X interrupts // only. 
- let pci_segment = PciSegment::new(0, resource_allocator, &[0u8; 32])?; + let pci_segment = PciSegment::new(0, &vm.common.resource_allocator, &[0u8; 32])?; self.pci_segment = Some(pci_segment); Ok(()) @@ -128,7 +125,6 @@ impl PciDevices { >( &mut self, vm: &Arc, - resource_allocator: &ResourceAllocator, id: String, device: Arc>, ) -> Result<(), PciManagerError> { @@ -137,17 +133,14 @@ impl PciDevices { let pci_device_bdf = pci_segment.next_device_bdf()?; debug!("Allocating BDF: {pci_device_bdf:?} for device"); let mem = vm.guest_memory().clone(); + let resource_allocator = &vm.common.resource_allocator; let device_type: u32 = device.lock().expect("Poisoned lock").device_type(); // Allocate one MSI vector per queue, plus one for configuration let msix_num = u16::try_from(device.lock().expect("Poisoned lock").queues().len() + 1).unwrap(); - let msix_vectors = Arc::new(Vm::create_msix_group( - vm.clone(), - resource_allocator, - msix_num, - )?); + let msix_vectors = Arc::new(Vm::create_msix_group(vm.clone(), msix_num)?); // Create the transport let mut virtio_device = @@ -187,7 +180,6 @@ impl PciDevices { fn restore_pci_device( &mut self, vm: &Arc, - resource_allocator: &ResourceAllocator, device: Arc>, device_id: &str, transport_state: &VirtioPciDeviceState, @@ -221,7 +213,7 @@ impl PciDevices { self.virtio_devices .insert((device_type, device_id.to_string()), virtio_device.clone()); - Self::register_bars_with_bus(resource_allocator, &virtio_device)?; + Self::register_bars_with_bus(&vm.common.resource_allocator, &virtio_device)?; virtio_device .lock() .expect("Poisoned lock") @@ -276,7 +268,6 @@ pub struct PciDevicesState { pub struct PciDevicesConstructorArgs<'a> { pub vm: Arc, pub mem: &'a GuestMemoryMmap, - pub resource_allocator: &'a Arc, pub vm_resources: &'a mut VmResources, pub instance_id: &'a str, pub restored_from_file: bool, @@ -288,7 +279,6 @@ impl<'a> Debug for PciDevicesConstructorArgs<'a> { f.debug_struct("PciDevicesConstructorArgs") .field("vm", 
&self.vm) .field("mem", &self.mem) - .field("resource_allocator", &self.resource_allocator) .field("vm_resources", &self.vm_resources) .field("instance_id", &self.instance_id) .field("restored_from_file", &self.restored_from_file) @@ -437,7 +427,7 @@ impl<'a> Persist<'a> for PciDevices { return Ok(pci_devices); } - pci_devices.attach_pci_segment(constructor_args.resource_allocator)?; + pci_devices.attach_pci_segment(&constructor_args.vm)?; if let Some(balloon_state) = &state.balloon_device { let device = Arc::new(Mutex::new( @@ -459,7 +449,6 @@ impl<'a> Persist<'a> for PciDevices { pci_devices .restore_pci_device( &constructor_args.vm, - constructor_args.resource_allocator, device, &balloon_state.device_id, &balloon_state.transport_state, @@ -485,7 +474,6 @@ impl<'a> Persist<'a> for PciDevices { pci_devices .restore_pci_device( &constructor_args.vm, - constructor_args.resource_allocator, device, &block_state.device_id, &block_state.transport_state, @@ -536,7 +524,6 @@ impl<'a> Persist<'a> for PciDevices { pci_devices .restore_pci_device( &constructor_args.vm, - constructor_args.resource_allocator, device, &net_state.device_id, &net_state.transport_state, @@ -570,7 +557,6 @@ impl<'a> Persist<'a> for PciDevices { pci_devices .restore_pci_device( &constructor_args.vm, - constructor_args.resource_allocator, device, &vsock_state.device_id, &vsock_state.transport_state, @@ -594,7 +580,6 @@ impl<'a> Persist<'a> for PciDevices { pci_devices .restore_pci_device( &constructor_args.vm, - constructor_args.resource_allocator, device, &entropy_state.device_id, &entropy_state.transport_state, diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 93385805e7b..24a0d3cca3d 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -12,7 +12,6 @@ use serde::{Deserialize, Serialize}; use super::acpi::ACPIDeviceManager; use super::mmio::*; -use super::resources::ResourceAllocator; #[cfg(target_arch = 
"aarch64")] use crate::arch::DeviceType; use crate::devices::acpi::vmgenid::{VMGenIDState, VMGenIdConstructorArgs, VmGenId, VmGenIdError}; @@ -165,7 +164,6 @@ pub struct MMIODevManagerConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, pub vm: &'a Vm, pub event_manager: &'a mut EventManager, - pub resource_allocator: &'a ResourceAllocator, pub vm_resources: &'a mut VmResources, pub instance_id: &'a str, pub restored_from_file: bool, @@ -191,7 +189,6 @@ pub struct ACPIDeviceManagerState { #[derive(Debug)] pub struct ACPIDeviceManagerConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, - pub resource_allocator: &'a ResourceAllocator, pub vm: &'a Vm, } @@ -223,7 +220,7 @@ impl<'a> Persist<'a> for ACPIDeviceManager { let vmgenid = VmGenId::restore( VMGenIdConstructorArgs { mem: constructor_args.mem, - resource_allocator: constructor_args.resource_allocator, + resource_allocator: &constructor_args.vm.common.resource_allocator, }, vmgenid_args, )?; @@ -387,17 +384,12 @@ impl<'a> Persist<'a> for MMIODeviceManager { .event_manager .add_subscriber(serial.clone()); - dev_manager.register_mmio_serial( - vm, - constructor_args.resource_allocator, - serial, - Some(state.device_info), - )?; + dev_manager.register_mmio_serial(vm, serial, Some(state.device_info))?; } if state.type_ == DeviceType::Rtc { let rtc = Arc::new(Mutex::new(RTCDevice::new())); dev_manager.register_mmio_rtc( - constructor_args.resource_allocator, + &constructor_args.vm.common.resource_allocator, rtc, Some(state.device_info), )?; @@ -412,7 +404,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { id: &String, state: &MmioTransportState, device_info: &MMIODeviceInfo, - mmio_bus: &vm_device::Bus, event_manager: &mut EventManager| -> Result<(), Self::Error> { let interrupt = Arc::new(IrqTrigger::new()); @@ -430,7 +421,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { dev_manager.register_mmio_virtio( vm, id.clone(), - mmio_bus, MMIODevice { resources: *device_info, inner: mmio_transport, @@ -469,7 +459,6 @@ 
impl<'a> Persist<'a> for MMIODeviceManager { &balloon_state.device_id, &balloon_state.transport_state, &balloon_state.device_info, - &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -492,7 +481,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { &block_state.device_id, &block_state.transport_state, &block_state.device_info, - &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -539,7 +527,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { &net_state.device_id, &net_state.transport_state, &net_state.device_info, - &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -569,7 +556,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { &vsock_state.device_id, &vsock_state.transport_state, &vsock_state.device_info, - &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -594,7 +580,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { &entropy_state.device_id, &entropy_state.transport_state, &entropy_state.device_info, - &constructor_args.resource_allocator.mmio_bus, constructor_args.event_manager, )?; } @@ -725,14 +710,11 @@ mod tests { let vmm = default_vmm(); let device_manager_state: device_manager::DevicesState = Snapshot::deserialize(&mut buf.as_slice()).unwrap(); - let resource_allocator = - ResourceAllocator::restore((), &device_manager_state.resource_allocator_state).unwrap(); let vm_resources = &mut VmResources::default(); let restore_args = MMIODevManagerConstructorArgs { mem: vmm.vm.guest_memory(), vm: &vmm.vm, event_manager: &mut event_manager, - resource_allocator: &resource_allocator, vm_resources, instance_id: "microvm-id", restored_from_file: true, diff --git a/src/vmm/src/devices/acpi/vmgenid.rs b/src/vmm/src/devices/acpi/vmgenid.rs index 0cf0ae0d7b1..5c8d4ecbc51 100644 --- a/src/vmm/src/devices/acpi/vmgenid.rs +++ b/src/vmm/src/devices/acpi/vmgenid.rs @@ -11,9 +11,9 @@ use vm_superio::Trigger; use 
vmm_sys_util::eventfd::EventFd; use super::super::legacy::EventFdTrigger; -use crate::device_manager::resources::ResourceAllocator; use crate::snapshot::Persist; use crate::vstate::memory::{Bytes, GuestMemoryMmap}; +use crate::vstate::resources::ResourceAllocator; /// Bytes of memory we allocate for VMGenID device pub const VMGENID_MEM_SIZE: u64 = 16; diff --git a/src/vmm/src/devices/pci/pci_segment.rs b/src/vmm/src/devices/pci/pci_segment.rs index c1e8bb07cb8..e957332bb0e 100644 --- a/src/vmm/src/devices/pci/pci_segment.rs +++ b/src/vmm/src/devices/pci/pci_segment.rs @@ -22,7 +22,7 @@ use vm_allocator::AddressAllocator; use vm_device::{BusDeviceSync, BusError}; use crate::arch::{PCI_MMCONFIG_START, PCI_MMIO_CONFIG_SIZE_PER_SEGMENT}; -use crate::device_manager::resources::ResourceAllocator; +use crate::vstate::resources::ResourceAllocator; pub struct PciSegment { pub(crate) id: u16, diff --git a/src/vmm/src/devices/virtio/transport/pci/device.rs b/src/vmm/src/devices/virtio/transport/pci/device.rs index 6793d502f00..384ad0358dd 100644 --- a/src/vmm/src/devices/virtio/transport/pci/device.rs +++ b/src/vmm/src/devices/virtio/transport/pci/device.rs @@ -33,7 +33,6 @@ use vmm_sys_util::errno; use vmm_sys_util::eventfd::EventFd; use crate::Vm; -use crate::device_manager::resources::ResourceAllocator; use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::queue::Queue; use crate::devices::virtio::transport::pci::common_config::{ @@ -45,6 +44,7 @@ use crate::logger::{debug, error}; use crate::snapshot::Persist; use crate::utils::u64_to_usize; use crate::vstate::memory::GuestMemoryMmap; +use crate::vstate::resources::ResourceAllocator; use crate::vstate::vm::{InterruptError, MsiVectorGroup}; const DEVICE_INIT: u8 = 0x00; @@ -1153,7 +1153,7 @@ mod tests { #[test] fn test_pci_device_config() { let mut vmm = default_vmm(); - vmm.device_manager.enable_pci(); + vmm.device_manager.enable_pci(&vmm.vm); let entropy = 
Arc::new(Mutex::new(Entropy::new(RateLimiter::default()).unwrap())); vmm.device_manager .attach_virtio_device( @@ -1271,7 +1271,7 @@ mod tests { #[test] fn test_reading_bars() { let mut vmm = default_vmm(); - vmm.device_manager.enable_pci(); + vmm.device_manager.enable_pci(&vmm.vm); let entropy = Arc::new(Mutex::new(Entropy::new(RateLimiter::default()).unwrap())); vmm.device_manager .attach_virtio_device( diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index d4cb5a78344..d65fcdbeed5 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -373,10 +373,10 @@ impl Vmm { self.vcpus_handles.reserve(vcpu_count); for mut vcpu in vcpus.drain(..) { - vcpu.set_mmio_bus(self.device_manager.resource_allocator.mmio_bus.clone()); + vcpu.set_mmio_bus(self.vm.common.resource_allocator.mmio_bus.clone()); #[cfg(target_arch = "x86_64")] vcpu.kvm_vcpu - .set_pio_bus(self.device_manager.resource_allocator.pio_bus.clone()); + .set_pio_bus(self.vm.common.resource_allocator.pio_bus.clone()); self.vcpus_handles .push(vcpu.start_threaded(vcpu_seccomp_filter.clone(), barrier.clone())?); diff --git a/src/vmm/src/vstate/mod.rs b/src/vmm/src/vstate/mod.rs index 47458835e04..f4fa25914d0 100644 --- a/src/vmm/src/vstate/mod.rs +++ b/src/vmm/src/vstate/mod.rs @@ -5,6 +5,8 @@ pub mod kvm; /// Module with GuestMemory implementation. pub mod memory; +/// Resource manager for devices. +pub mod resources; /// Module with Vcpu implementation. pub mod vcpu; /// Module with Vm implementation. 
diff --git a/src/vmm/src/device_manager/resources.rs b/src/vmm/src/vstate/resources.rs similarity index 96% rename from src/vmm/src/device_manager/resources.rs rename to src/vmm/src/vstate/resources.rs index f7035e55566..3b77b892bc3 100644 --- a/src/vmm/src/device_manager/resources.rs +++ b/src/vmm/src/vstate/resources.rs @@ -22,13 +22,13 @@ use crate::snapshot::Persist; /// * Memory allocations in the MMIO address space #[derive(Debug)] pub struct ResourceAllocator { - // Allocator for device interrupt lines + /// Allocator for device interrupt lines pub gsi_allocator: Arc>, - // Allocator for memory in the 32-bit MMIO address space + /// Allocator for memory in the 32-bit MMIO address space pub mmio32_memory: Arc>, - // Allocator for memory in the 64-bit MMIO address space + /// Allocator for memory in the 64-bit MMIO address space pub mmio64_memory: Arc>, - // Memory allocator for system data + /// Memory allocator for system data pub system_memory: Arc>, /// MMIO bus pub mmio_bus: Arc, @@ -186,14 +186,15 @@ impl<'a> Persist<'a> for ResourceAllocator { } #[derive(Debug, Clone, Serialize, Deserialize)] +/// State of a ResourceAllocator pub struct ResourceAllocatorState { - // Allocator for device interrupt lines + /// Allocator for device interrupt lines pub gsi_allocator: Arc>, - // Allocator for memory in the 32-bit MMIO address space + /// Allocator for memory in the 32-bit MMIO address space pub mmio32_memory: Arc>, - // Allocator for memory in the 64-bit MMIO address space + /// Allocator for memory in the 64-bit MMIO address space pub mmio64_memory: Arc>, - // Memory allocator for system data + /// Memory allocator for system data pub system_memory: Arc>, } diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 950bcac652d..a2c3a65be6b 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -28,7 +28,6 @@ use vmm_sys_util::errno; use vmm_sys_util::eventfd::EventFd; pub use crate::arch::{ArchVm as Vm, ArchVmError, VmState}; 
-use crate::device_manager::resources::ResourceAllocator; use crate::logger::info; use crate::persist::CreateSnapshotError; use crate::snapshot::Persist; @@ -37,6 +36,7 @@ use crate::vmm_config::snapshot::SnapshotType; use crate::vstate::memory::{ Address, GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryRegion, GuestRegionMmap, }; +use crate::vstate::resources::ResourceAllocator; use crate::vstate::vcpu::VcpuError; use crate::{DirtyBitmap, Vcpu, mem_size_mib}; @@ -244,6 +244,8 @@ pub struct VmCommon { pub guest_memory: GuestMemoryMmap, /// Interrupts used by Vm's devices pub interrupts: Mutex>, + /// Allocator for VM resources + pub resource_allocator: Arc, } /// Errors associated with the wrappers over KVM ioctls. @@ -265,6 +267,8 @@ pub enum VmError { NotEnoughMemorySlots, /// Memory Error: {0} VmMemory(#[from] vm_memory::Error), + /// ResourceAllocator error: {0} + ResourceAllocator(#[from] vm_allocator::Error) } /// Contains Vm functions that are usable across CPU architectures @@ -312,6 +316,7 @@ impl Vm { max_memslots: kvm.max_nr_memslots(), guest_memory: GuestMemoryMmap::default(), interrupts: Mutex::new(HashMap::new()), + resource_allocator: Arc::new(ResourceAllocator::new()?), }) } @@ -565,14 +570,12 @@ impl Vm { } /// Create a group of MSI-X interrupts - pub fn create_msix_group( - vm: Arc, - resource_allocator: &ResourceAllocator, - count: u16, - ) -> Result { + pub fn create_msix_group(vm: Arc, count: u16) -> Result { debug!("Creating new MSI group with {count} vectors"); let mut irq_routes = HashMap::with_capacity(count as usize); - for (gsi, i) in resource_allocator + for (gsi, i) in vm + .common + .resource_allocator .allocate_gsi(count as u32)? .iter() .zip(0u32..) 
@@ -723,8 +726,7 @@ pub(crate) mod tests { } fn create_msix_group(vm: &Arc) -> MsiVectorGroup { - let resource_allocator = ResourceAllocator::new().unwrap(); - Vm::create_msix_group(vm.clone(), &resource_allocator, 4).unwrap() + Vm::create_msix_group(vm.clone(), 4).unwrap() } #[test] From 6a9fd471ddd4c0769a7c4d3f49fb3a3b7010778e Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 13 Jun 2025 17:09:46 +0200 Subject: [PATCH 49/56] refactor(vm): move `Bus` objects to Vm We had previously added MMIO and Port IO buses inside ResourceAllocator so that we could implement DeviceRelocation for the type. Now, we will delegate device relocation responsibilities to ArchVm instead. That is because device relocation requires access to the Vm file descriptor as well. As a result, we can move buses to the Vm object itself. Add MMIO bus to VmCommon as both architectures use it. Add PortIO bus for x86 architecture only. Note that we still don't support DeviceRelocation. VirtIO devices should not request us to relocate them. Also, for adding such support we would need to also support VirtIO reset. We will look into adding this functionality later on. 
Signed-off-by: Babis Chalios --- src/vmm/src/arch/x86_64/vm.rs | 6 +++ src/vmm/src/device_manager/legacy.rs | 2 +- src/vmm/src/device_manager/mmio.rs | 10 ++--- src/vmm/src/device_manager/mod.rs | 5 +-- src/vmm/src/device_manager/pci_mngr.rs | 24 ++++------ src/vmm/src/device_manager/persist.rs | 2 +- src/vmm/src/devices/pci/pci_segment.rs | 61 ++++++++++++-------------- src/vmm/src/lib.rs | 5 +-- src/vmm/src/vstate/resources.rs | 26 ----------- src/vmm/src/vstate/vm.rs | 20 ++++++++- 10 files changed, 73 insertions(+), 88 deletions(-) diff --git a/src/vmm/src/arch/x86_64/vm.rs b/src/vmm/src/arch/x86_64/vm.rs index 9d22bf9a757..fbc27c82a60 100644 --- a/src/vmm/src/arch/x86_64/vm.rs +++ b/src/vmm/src/arch/x86_64/vm.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use std::fmt; +use std::sync::Arc; use kvm_bindings::{ KVM_CLOCK_TSC_STABLE, KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, @@ -58,6 +59,8 @@ pub struct ArchVm { /// /// `None` if `KVM_CAP_XSAVE2` not supported. 
xsave2_size: Option, + /// Port IO bus + pub pio_bus: Arc, } impl ArchVm { @@ -92,10 +95,13 @@ impl ArchVm { .set_tss_address(u64_to_usize(crate::arch::x86_64::layout::KVM_TSS_ADDRESS)) .map_err(ArchVmError::SetTssAddress)?; + let pio_bus = Arc::new(vm_device::Bus::new()); + Ok(ArchVm { common, msrs_to_save, xsave2_size, + pio_bus, }) } diff --git a/src/vmm/src/device_manager/legacy.rs b/src/vmm/src/device_manager/legacy.rs index 47b259ef87b..d0194e24e62 100644 --- a/src/vmm/src/device_manager/legacy.rs +++ b/src/vmm/src/device_manager/legacy.rs @@ -119,7 +119,7 @@ impl PortIODeviceManager { input: None, })); - let io_bus = &vm.common.resource_allocator.pio_bus; + let io_bus = &vm.pio_bus; io_bus.insert( self.stdio_serial.clone(), Self::SERIAL_PORT_ADDRESSES[0], diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index da32cf14271..13ab13f47ea 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -202,7 +202,7 @@ impl MMIODeviceManager { .map_err(MmioError::RegisterIrqFd)?; } - vm.common.resource_allocator.mmio_bus.insert( + vm.common.mmio_bus.insert( device.inner.clone(), device.resources.addr, device.resources.len, @@ -296,7 +296,7 @@ impl MMIODeviceManager { inner: serial, }; - vm.common.resource_allocator.mmio_bus.insert( + vm.common.mmio_bus.insert( device.inner.clone(), device.resources.addr, device.resources.len, @@ -327,7 +327,7 @@ impl MMIODeviceManager { /// given as parameter, otherwise allocate a new MMIO resources for it. 
pub fn register_mmio_rtc( &mut self, - resource_allocator: &ResourceAllocator, + vm: &Vm, rtc: Arc>, device_info_opt: Option, ) -> Result<(), MmioError> { @@ -336,7 +336,7 @@ impl MMIODeviceManager { let device_info = if let Some(device_info) = device_info_opt { device_info } else { - let gsi = resource_allocator.allocate_gsi(1)?; + let gsi = vm.common.resource_allocator.allocate_gsi(1)?; MMIODeviceInfo { addr: RTC_MEM_START, len: MMIO_LEN, @@ -349,7 +349,7 @@ impl MMIODeviceManager { inner: rtc, }; - resource_allocator.mmio_bus.insert( + vm.common.mmio_bus.insert( device.inner.clone(), device.resources.addr, device.resources.len, diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index f037a4a8d05..c641a1aac0e 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -217,7 +217,7 @@ impl DeviceManager { let boot_timer = Arc::new(Mutex::new(BootTimer::new(request_ts))); self.mmio_devices - .register_mmio_boot_timer(&vm.common.resource_allocator.mmio_bus, boot_timer)?; + .register_mmio_boot_timer(&vm.common.mmio_bus, boot_timer)?; Ok(()) } @@ -256,8 +256,7 @@ impl DeviceManager { } let rtc = Arc::new(Mutex::new(RTCDevice::new())); - self.mmio_devices - .register_mmio_rtc(&vm.common.resource_allocator, rtc, None)?; + self.mmio_devices.register_mmio_rtc(vm, rtc, None)?; Ok(()) } diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index 199c6ec3c7c..303a34a3448 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -33,7 +33,6 @@ use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; use crate::resources::VmResources; use crate::snapshot::Persist; use crate::vstate::memory::GuestMemoryMmap; -use crate::vstate::resources::ResourceAllocator; use crate::vstate::vm::{InterruptError, MsiVectorGroup}; use crate::{EventManager, Vm}; @@ -75,14 +74,14 @@ impl PciDevices { // Currently we don't assign any IRQs 
to PCI devices. We will be using MSI-X interrupts // only. - let pci_segment = PciSegment::new(0, &vm.common.resource_allocator, &[0u8; 32])?; + let pci_segment = PciSegment::new(0, vm, &[0u8; 32])?; self.pci_segment = Some(pci_segment); Ok(()) } fn register_bars_with_bus( - resource_allocator: &ResourceAllocator, + vm: &Vm, virtio_device: &Arc>, ) -> Result<(), PciManagerError> { for bar in &virtio_device.lock().expect("Poisoned lock").bar_regions { @@ -94,11 +93,8 @@ impl PciDevices { bar.size() ); #[cfg(target_arch = "x86_64")] - resource_allocator.pio_bus.insert( - virtio_device.clone(), - bar.addr(), - bar.size(), - )?; + vm.pio_bus + .insert(virtio_device.clone(), bar.addr(), bar.size())?; #[cfg(target_arch = "aarch64")] log::error!("pci: We do not support I/O region allocation") } @@ -108,11 +104,9 @@ impl PciDevices { bar.addr(), bar.size() ); - resource_allocator.mmio_bus.insert( - virtio_device.clone(), - bar.addr(), - bar.size(), - )?; + vm.common + .mmio_bus + .insert(virtio_device.clone(), bar.addr(), bar.size())?; } } } @@ -168,7 +162,7 @@ impl PciDevices { self.virtio_devices .insert((device_type, id.clone()), virtio_device.clone()); - Self::register_bars_with_bus(resource_allocator, &virtio_device)?; + Self::register_bars_with_bus(vm, &virtio_device)?; virtio_device .lock() .expect("Poisoned lock") @@ -213,7 +207,7 @@ impl PciDevices { self.virtio_devices .insert((device_type, device_id.to_string()), virtio_device.clone()); - Self::register_bars_with_bus(&vm.common.resource_allocator, &virtio_device)?; + Self::register_bars_with_bus(vm, &virtio_device)?; virtio_device .lock() .expect("Poisoned lock") diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 24a0d3cca3d..87358181df9 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -389,7 +389,7 @@ impl<'a> Persist<'a> for MMIODeviceManager { if state.type_ == DeviceType::Rtc { let rtc = 
Arc::new(Mutex::new(RTCDevice::new())); dev_manager.register_mmio_rtc( - &constructor_args.vm.common.resource_allocator, + constructor_args.vm, rtc, Some(state.device_info), )?; diff --git a/src/vmm/src/devices/pci/pci_segment.rs b/src/vmm/src/devices/pci/pci_segment.rs index e957332bb0e..c37763eab3a 100644 --- a/src/vmm/src/devices/pci/pci_segment.rs +++ b/src/vmm/src/devices/pci/pci_segment.rs @@ -21,7 +21,7 @@ use uuid::Uuid; use vm_allocator::AddressAllocator; use vm_device::{BusDeviceSync, BusError}; -use crate::arch::{PCI_MMCONFIG_START, PCI_MMIO_CONFIG_SIZE_PER_SEGMENT}; +use crate::arch::{ArchVm as Vm, PCI_MMCONFIG_START, PCI_MMIO_CONFIG_SIZE_PER_SEGMENT}; use crate::vstate::resources::ResourceAllocator; pub struct PciSegment { @@ -67,28 +67,21 @@ impl std::fmt::Debug for PciSegment { } impl PciSegment { - fn build( - id: u16, - resource_allocator: &Arc, - pci_irq_slots: &[u8; 32], - ) -> Result { + fn build(id: u16, vm: &Arc, pci_irq_slots: &[u8; 32]) -> Result { let pci_root = PciRoot::new(None); - let pci_bus = Arc::new(Mutex::new(PciBus::new( - pci_root, - resource_allocator.clone(), - ))); + let pci_bus = Arc::new(Mutex::new(PciBus::new(pci_root, vm.clone()))); let pci_config_mmio = Arc::new(Mutex::new(PciConfigMmio::new(Arc::clone(&pci_bus)))); let mmio_config_address = PCI_MMCONFIG_START + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT * id as u64; - resource_allocator.mmio_bus.insert( + vm.common.mmio_bus.insert( Arc::clone(&pci_config_mmio) as Arc, mmio_config_address, PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, )?; - let mem32_allocator = resource_allocator.mmio32_memory.clone(); - let mem64_allocator = resource_allocator.mmio64_memory.clone(); + let mem32_allocator = vm.common.resource_allocator.mmio32_memory.clone(); + let mem64_allocator = vm.common.resource_allocator.mmio64_memory.clone(); let start_of_mem32_area = mem32_allocator.lock().unwrap().base(); let end_of_mem32_area = mem32_allocator.lock().unwrap().end(); @@ -119,13 +112,15 @@ impl PciSegment { 
#[cfg(target_arch = "x86_64")] pub(crate) fn new( id: u16, - resource_allocator: &Arc, + vm: &Arc, pci_irq_slots: &[u8; 32], ) -> Result { - let mut segment = Self::build(id, resource_allocator, pci_irq_slots)?; + use crate::Vm; + + let mut segment = Self::build(id, vm, pci_irq_slots)?; let pci_config_io = Arc::new(Mutex::new(PciConfigIo::new(Arc::clone(&segment.pci_bus)))); - resource_allocator.pio_bus.insert( + vm.pio_bus.insert( pci_config_io.clone(), PCI_CONFIG_IO_PORT, PCI_CONFIG_IO_PORT_SIZE, @@ -151,10 +146,10 @@ impl PciSegment { #[cfg(target_arch = "aarch64")] pub(crate) fn new( id: u16, - resource_allocator: &Arc, + vm: &Arc, pci_irq_slots: &[u8; 32], ) -> Result { - let segment = Self::build(id, resource_allocator, pci_irq_slots)?; + let segment = Self::build(id, vm, pci_irq_slots)?; info!( "pci: adding PCI segment: id={:#x}, PCI MMIO config address: {:#x}, mem32 area: \ [{:#x}-{:#x}], mem64 area: [{:#x}-{:#x}]", @@ -468,13 +463,14 @@ mod tests { use super::*; use crate::arch; + use crate::builder::tests::default_vmm; use crate::utils::u64_to_usize; #[test] fn test_pci_segment_build() { - let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); + let vmm = default_vmm(); let pci_irq_slots = &[0u8; 32]; - let pci_segment = PciSegment::new(0, &resource_allocator, pci_irq_slots).unwrap(); + let pci_segment = PciSegment::new(0, &vmm.vm, pci_irq_slots).unwrap(); assert_eq!(pci_segment.id, 0); assert_eq!( @@ -503,17 +499,14 @@ mod tests { #[cfg(target_arch = "x86_64")] #[test] fn test_io_bus() { - let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); + let vmm = default_vmm(); let pci_irq_slots = &[0u8; 32]; - let pci_segment = PciSegment::new(0, &resource_allocator, pci_irq_slots).unwrap(); + let pci_segment = PciSegment::new(0, &vmm.vm, pci_irq_slots).unwrap(); let mut data = [0u8; u64_to_usize(PCI_CONFIG_IO_PORT_SIZE)]; - resource_allocator - .pio_bus - .read(PCI_CONFIG_IO_PORT, &mut data) - .unwrap(); + 
vmm.vm.pio_bus.read(PCI_CONFIG_IO_PORT, &mut data).unwrap(); - resource_allocator + vmm.vm .pio_bus .read(PCI_CONFIG_IO_PORT + PCI_CONFIG_IO_PORT_SIZE, &mut data) .unwrap_err(); @@ -521,17 +514,19 @@ mod tests { #[test] fn test_mmio_bus() { - let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); + let vmm = default_vmm(); let pci_irq_slots = &[0u8; 32]; - let pci_segment = PciSegment::new(0, &resource_allocator, pci_irq_slots).unwrap(); + let pci_segment = PciSegment::new(0, &vmm.vm, pci_irq_slots).unwrap(); let mut data = [0u8; u64_to_usize(PCI_MMIO_CONFIG_SIZE_PER_SEGMENT)]; - resource_allocator + vmm.vm + .common .mmio_bus .read(pci_segment.mmio_config_address, &mut data) .unwrap(); - resource_allocator + vmm.vm + .common .mmio_bus .read( pci_segment.mmio_config_address + PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, @@ -542,9 +537,9 @@ mod tests { #[test] fn test_next_device_bdf() { - let resource_allocator = Arc::new(ResourceAllocator::new().unwrap()); + let vmm = default_vmm(); let pci_irq_slots = &[0u8; 32]; - let pci_segment = PciSegment::new(0, &resource_allocator, pci_irq_slots).unwrap(); + let pci_segment = PciSegment::new(0, &vmm.vm, pci_irq_slots).unwrap(); // Start checking from device id 1, since 0 is allocated to the Root port. for dev_id in 1..32 { diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index d65fcdbeed5..4549c79857a 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -373,10 +373,9 @@ impl Vmm { self.vcpus_handles.reserve(vcpu_count); for mut vcpu in vcpus.drain(..) 
{ - vcpu.set_mmio_bus(self.vm.common.resource_allocator.mmio_bus.clone()); + vcpu.set_mmio_bus(self.vm.common.mmio_bus.clone()); #[cfg(target_arch = "x86_64")] - vcpu.kvm_vcpu - .set_pio_bus(self.vm.common.resource_allocator.pio_bus.clone()); + vcpu.kvm_vcpu.set_pio_bus(self.vm.pio_bus.clone()); self.vcpus_handles .push(vcpu.start_threaded(vcpu_seccomp_filter.clone(), barrier.clone())?); diff --git a/src/vmm/src/vstate/resources.rs b/src/vmm/src/vstate/resources.rs index 3b77b892bc3..3d8d8016e97 100644 --- a/src/vmm/src/vstate/resources.rs +++ b/src/vmm/src/vstate/resources.rs @@ -4,11 +4,9 @@ use std::convert::Infallible; use std::sync::{Arc, Mutex}; -use pci::DeviceRelocation; use serde::{Deserialize, Serialize}; pub use vm_allocator::AllocPolicy; use vm_allocator::{AddressAllocator, IdAllocator}; -use vm_device::Bus; use crate::arch; use crate::snapshot::Persist; @@ -30,11 +28,6 @@ pub struct ResourceAllocator { pub mmio64_memory: Arc>, /// Memory allocator for system data pub system_memory: Arc>, - /// MMIO bus - pub mmio_bus: Arc, - #[cfg(target_arch = "x86_64")] - /// Port IO bus - pub pio_bus: Arc, } impl ResourceAllocator { @@ -54,9 +47,6 @@ impl ResourceAllocator { arch::SYSTEM_MEM_START, arch::SYSTEM_MEM_SIZE, )?)), - mmio_bus: Arc::new(Bus::new()), - #[cfg(target_arch = "x86_64")] - pio_bus: Arc::new(Bus::new()), }) } @@ -178,9 +168,6 @@ impl<'a> Persist<'a> for ResourceAllocator { mmio32_memory: state.mmio32_memory.clone(), mmio64_memory: state.mmio64_memory.clone(), system_memory: state.system_memory.clone(), - mmio_bus: Arc::new(Bus::new()), - #[cfg(target_arch = "x86_64")] - pio_bus: Arc::new(Bus::new()), }) } } @@ -219,19 +206,6 @@ impl Default for ResourceAllocatorState { } } -impl DeviceRelocation for ResourceAllocator { - fn move_bar( - &self, - _old_base: u64, - _new_base: u64, - _len: u64, - _pci_dev: &mut dyn pci::PciDevice, - _region_type: pci::PciBarRegionType, - ) -> Result<(), std::io::Error> { - todo!() - } -} - #[cfg(test)] mod tests { 
use vm_allocator::AllocPolicy; diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index a2c3a65be6b..6bdfad5e37b 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -19,7 +19,8 @@ use kvm_bindings::{ KvmIrqRouting, kvm_irq_routing_entry, kvm_userspace_memory_region, }; use kvm_ioctls::VmFd; -use log::debug; +use log::{debug, error}; +use pci::DeviceRelocation; use serde::{Deserialize, Serialize}; use vm_device::interrupt::{ InterruptIndex, InterruptSourceConfig, InterruptSourceGroup, MsiIrqSourceConfig, @@ -246,6 +247,8 @@ pub struct VmCommon { pub interrupts: Mutex>, /// Allocator for VM resources pub resource_allocator: Arc, + /// MMIO bus + pub mmio_bus: Arc, } /// Errors associated with the wrappers over KVM ioctls. @@ -317,6 +320,7 @@ impl Vm { guest_memory: GuestMemoryMmap::default(), interrupts: Mutex::new(HashMap::new()), resource_allocator: Arc::new(ResourceAllocator::new()?), + mmio_bus: Arc::new(vm_device::Bus::new()), }) } @@ -603,6 +607,20 @@ impl Vm { } } +impl DeviceRelocation for Vm { + fn move_bar( + &self, + _old_base: u64, + _new_base: u64, + _len: u64, + _pci_dev: &mut dyn pci::PciDevice, + _region_type: pci::PciBarRegionType, + ) -> Result<(), std::io::Error> { + error!("pci: device relocation not supported"); + Err(std::io::Error::from(std::io::ErrorKind::Unsupported)) + } +} + #[cfg(test)] pub(crate) mod tests { use vm_device::interrupt::{InterruptSourceConfig, LegacyIrqSourceConfig}; From ad926da6002b61410b007f598e13a2a263234b80 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 18 Jun 2025 23:39:14 +0200 Subject: [PATCH 50/56] arm: support MSI-X on ARM Add support for ITS device which provides support for MSI interrupts on ARM architecture. This is currently supported only on systems with GICv3 interrupt controller. In order to make saving/restore of ITS state work properly, we need to change the order in which we restore redistributor register GICR_CTLR. 
We need to make sure that this register is restored last. Otherwise, restoring GICR_PROPBASER doesn't have any effect and ITS depends on it in order to save/restore ITS tables to/from guest memory. Signed-off-by: Babis Chalios --- src/vmm/src/arch/aarch64/fdt.rs | 30 ++++ src/vmm/src/arch/aarch64/gic/gicv2/mod.rs | 4 +- .../src/arch/aarch64/gic/gicv2/regs/mod.rs | 1 + src/vmm/src/arch/aarch64/gic/gicv3/mod.rs | 88 +++++++++--- .../arch/aarch64/gic/gicv3/regs/its_regs.rs | 135 ++++++++++++++++++ .../src/arch/aarch64/gic/gicv3/regs/mod.rs | 48 +++++-- .../aarch64/gic/gicv3/regs/redist_regs.rs | 2 +- src/vmm/src/arch/aarch64/gic/mod.rs | 22 +++ src/vmm/src/arch/aarch64/gic/regs.rs | 3 + src/vmm/src/arch/aarch64/output_GICv3.dtb | Bin 2097152 -> 2097152 bytes .../src/arch/aarch64/output_initrd_GICv3.dtb | Bin 2097152 -> 2097152 bytes 11 files changed, 298 insertions(+), 35 deletions(-) create mode 100644 src/vmm/src/arch/aarch64/gic/gicv3/regs/its_regs.rs diff --git a/src/vmm/src/arch/aarch64/fdt.rs b/src/vmm/src/arch/aarch64/fdt.rs index 0073d7dbc05..a4cf14b52d7 100644 --- a/src/vmm/src/arch/aarch64/fdt.rs +++ b/src/vmm/src/arch/aarch64/fdt.rs @@ -28,6 +28,8 @@ use crate::vstate::memory::{Address, GuestMemory, GuestMemoryMmap}; const GIC_PHANDLE: u32 = 1; // This is a value for uniquely identifying the FDT node containing the clock definition. const CLOCK_PHANDLE: u32 = 2; +// This is a value for uniquely identifying the FDT node declaring the MSI controller. +const MSI_PHANDLE: u32 = 3; // You may be wondering why this big value? // This phandle is used to uniquely identify the FDT nodes containing cache information. Each cpu // can have a variable number of caches, some of these caches may be shared with other cpus. 
@@ -302,6 +304,16 @@ fn create_gic_node(fdt: &mut FdtWriter, gic_device: &GICDevice) -> Result<(), Fd ]; fdt.property_array_u32("interrupts", &gic_intr)?; + + if let Some(msi_properties) = gic_device.msi_properties() { + let msic_node = fdt.begin_node("msic")?; + fdt.property_string("compatible", "arm,gic-v3-its")?; + fdt.property_null("msi-controller")?; + fdt.property_u32("phandle", MSI_PHANDLE)?; + fdt.property_array_u64("reg", msi_properties)?; + fdt.end_node(msic_node)?; + } + fdt.end_node(interrupt)?; Ok(()) @@ -471,6 +483,21 @@ fn create_pci_nodes(fdt: &mut FdtWriter, pci_devices: &PciDevices) -> Result<(), (MEM_64BIT_DEVICES_SIZE >> 32) as u32, // Range size ((MEM_64BIT_DEVICES_SIZE & 0xffff_ffff) >> 32) as u32, ]; + + // See kernel document Documentation/devicetree/bindings/pci/pci-msi.txt + let msi_map = [ + // rid-base: A single cell describing the first RID matched by the entry. + 0x0, + // msi-controller: A single phandle to an MSI controller. + MSI_PHANDLE, + // msi-base: An msi-specifier describing the msi-specifier produced for the + // first RID matched by the entry. + segment.id as u32, + // length: A single cell describing how many consecutive RIDs are matched + // following the rid-base. + 0x100, + ]; + let pci_node = fdt.begin_node(&pci_node_name)?; fdt.property_string("compatible", "pci-host-ecam-generic")?; @@ -491,6 +518,9 @@ fn create_pci_nodes(fdt: &mut FdtWriter, pci_devices: &PciDevices) -> Result<(), fdt.property_null("interrupt-map")?; fdt.property_null("interrupt-map-mask")?; fdt.property_null("dma-coherent")?; + fdt.property_array_u32("msi-map", &msi_map)?; + fdt.property_u32("msi-parent", MSI_PHANDLE)?; + Ok(fdt.end_node(pci_node)?) 
} diff --git a/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs index c4b9208a0a6..dfa2302d6be 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv2/mod.rs @@ -68,7 +68,9 @@ impl GICv2 { GICv2::get_cpu_addr(), GICv2::get_cpu_size(), ], + msi_properties: None, vcpu_count, + its_device: None, }) } @@ -82,7 +84,7 @@ impl GICv2 { pub fn init_device_attributes(gic_device: &Self) -> Result<(), GicError> { // Setting up the distributor attribute. - // We are placing the GIC below 1GB so we need to substract the size of the distributor. + // We are placing the GIC below 1GB so we need to subtract the size of the distributor. Self::set_device_attribute( gic_device.device_fd(), kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ADDR, diff --git a/src/vmm/src/arch/aarch64/gic/gicv2/regs/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv2/regs/mod.rs index 8bb26ce2bcd..2b617716fe2 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv2/regs/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv2/regs/mod.rs @@ -22,6 +22,7 @@ pub fn save_state(fd: &DeviceFd, mpidrs: &[u64]) -> Result { Ok(GicState { dist: dist_regs::get_dist_regs(fd)?, gic_vcpu_states: vcpu_states, + ..Default::default() }) } diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs index 39c4e5ce148..075687bc23e 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv3/mod.rs @@ -1,7 +1,7 @@ // Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 -mod regs; +pub mod regs; use kvm_ioctls::{DeviceFd, VmFd}; @@ -18,12 +18,19 @@ impl std::ops::Deref for GICv3 { } } +impl std::ops::DerefMut for GICv3 { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + impl GICv3 { // Unfortunately bindgen omits defines that are based on other defines. // See arch/arm64/include/uapi/asm/kvm.h file from the linux kernel. 
const SZ_64K: u64 = 0x0001_0000; const KVM_VGIC_V3_DIST_SIZE: u64 = GICv3::SZ_64K; const KVM_VGIC_V3_REDIST_SIZE: u64 = (2 * GICv3::SZ_64K); + const GIC_V3_ITS_SIZE: u64 = 0x2_0000; // Device trees specific constants const ARCH_GIC_V3_MAINT_IRQ: u32 = 9; @@ -48,6 +55,16 @@ impl GICv3 { vcpu_count * GICv3::KVM_VGIC_V3_REDIST_SIZE } + /// Get the MSI address + fn get_msi_address(vcpu_count: u64) -> u64 { + Self::get_redists_addr(vcpu_count) - GICv3::GIC_V3_ITS_SIZE + } + + /// Get the MSI size + const fn get_msi_size() -> u64 { + GICv3::GIC_V3_ITS_SIZE + } + pub const VERSION: u32 = kvm_bindings::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_V3; pub fn fdt_compatibility(&self) -> &str { @@ -59,30 +76,43 @@ impl GICv3 { } /// Create the GIC device object - pub fn create_device(fd: DeviceFd, vcpu_count: u64) -> Self { - GICv3(super::GIC { - fd, + pub fn create_device(vm: &VmFd, vcpu_count: u64) -> Result { + // Create the GIC device + let mut gic_device = kvm_bindings::kvm_create_device { + type_: Self::VERSION, + fd: 0, + flags: 0, + }; + + let gic_fd = vm + .create_device(&mut gic_device) + .map_err(GicError::CreateGIC)?; + + Ok(GICv3(super::GIC { + fd: gic_fd, properties: [ GICv3::get_dist_addr(), GICv3::get_dist_size(), GICv3::get_redists_addr(vcpu_count), GICv3::get_redists_size(vcpu_count), ], + msi_properties: Some([GICv3::get_msi_address(vcpu_count), GICv3::get_msi_size()]), vcpu_count, - }) + its_device: None, + })) } pub fn save_device(&self, mpidrs: &[u64]) -> Result { - regs::save_state(&self.fd, mpidrs) + regs::save_state(&self.fd, self.its_device.as_ref().unwrap(), mpidrs) } pub fn restore_device(&self, mpidrs: &[u64], state: &GicState) -> Result<(), GicError> { - regs::restore_state(&self.fd, mpidrs, state) + regs::restore_state(&self.fd, self.its_device.as_ref().unwrap(), mpidrs, state) } pub fn init_device_attributes(gic_device: &Self) -> Result<(), GicError> { // Setting up the distributor attribute. 
- // We are placing the GIC below 1GB so we need to substract the size of the distributor. + // We are placing the GIC below 1GB so we need to subtract the size of the distributor. Self::set_device_attribute( gic_device.device_fd(), kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ADDR, @@ -104,25 +134,45 @@ impl GICv3 { Ok(()) } - /// Initialize a GIC device - pub fn init_device(vm: &VmFd) -> Result { - let mut gic_device = kvm_bindings::kvm_create_device { - type_: Self::VERSION, + fn init_its(vm: &VmFd, gic_device: &mut Self) -> Result<(), GicError> { + // ITS part attributes + let mut its_device = kvm_bindings::kvm_create_device { + type_: kvm_bindings::kvm_device_type_KVM_DEV_TYPE_ARM_VGIC_ITS, fd: 0, flags: 0, }; - vm.create_device(&mut gic_device) - .map_err(GicError::CreateGIC) + let its_fd = vm + .create_device(&mut its_device) + .map_err(GicError::CreateGIC)?; + + // Setting up the ITS attributes + Self::set_device_attribute( + &its_fd, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_ADDR, + u64::from(kvm_bindings::KVM_VGIC_ITS_ADDR_TYPE), + &Self::get_msi_address(gic_device.vcpu_count()) as *const u64 as u64, + 0, + )?; + + Self::set_device_attribute( + &its_fd, + kvm_bindings::KVM_DEV_ARM_VGIC_GRP_CTRL, + u64::from(kvm_bindings::KVM_DEV_ARM_VGIC_CTRL_INIT), + 0, + 0, + )?; + + gic_device.its_device = Some(its_fd); + Ok(()) } /// Method to initialize the GIC device pub fn create(vm: &VmFd, vcpu_count: u64) -> Result { - let vgic_fd = Self::init_device(vm)?; - - let device = Self::create_device(vgic_fd, vcpu_count); + let mut device = Self::create_device(vm, vcpu_count)?; Self::init_device_attributes(&device)?; + Self::init_its(vm, &mut device)?; Self::finalize_device(&device)?; @@ -184,14 +234,14 @@ impl GICv3 { /// RDIST pending tables into guest RAM. /// /// The tables get flushed to guest RAM whenever the VM gets stopped. 
-fn save_pending_tables(fd: &DeviceFd) -> Result<(), GicError> { +fn save_pending_tables(gic_device: &DeviceFd) -> Result<(), GicError> { let init_gic_attr = kvm_bindings::kvm_device_attr { group: kvm_bindings::KVM_DEV_ARM_VGIC_GRP_CTRL, attr: u64::from(kvm_bindings::KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES), addr: 0, flags: 0, }; - fd.set_device_attr(&init_gic_attr).map_err(|err| { + gic_device.set_device_attr(&init_gic_attr).map_err(|err| { GicError::DeviceAttribute(err, true, kvm_bindings::KVM_DEV_ARM_VGIC_GRP_CTRL) }) } diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/regs/its_regs.rs b/src/vmm/src/arch/aarch64/gic/gicv3/regs/its_regs.rs new file mode 100644 index 00000000000..ee4ecafba1e --- /dev/null +++ b/src/vmm/src/arch/aarch64/gic/gicv3/regs/its_regs.rs @@ -0,0 +1,135 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use kvm_bindings::{ + KVM_DEV_ARM_ITS_RESTORE_TABLES, KVM_DEV_ARM_ITS_SAVE_TABLES, KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_VGIC_GRP_ITS_REGS, +}; +use kvm_ioctls::DeviceFd; +use serde::{Deserialize, Serialize}; + +use crate::arch::aarch64::gic::GicError; + +// ITS registers that we want to preserve across snapshots +const GITS_CTLR: u32 = 0x0000; +const GITS_IIDR: u32 = 0x0004; +const GITS_CBASER: u32 = 0x0080; +const GITS_CWRITER: u32 = 0x0088; +const GITS_CREADR: u32 = 0x0090; +const GITS_BASER: u32 = 0x0100; + +fn set_device_attribute( + its_device: &DeviceFd, + group: u32, + attr: u32, + val: u64, +) -> Result<(), GicError> { + let gicv3_its_attr = kvm_bindings::kvm_device_attr { + group, + attr: attr as u64, + addr: &val as *const u64 as u64, + flags: 0, + }; + + its_device + .set_device_attr(&gicv3_its_attr) + .map_err(|err| GicError::DeviceAttribute(err, true, group)) +} + +fn get_device_attribute(its_device: &DeviceFd, group: u32, attr: u32) -> Result { + let mut val = 0; + + let mut gicv3_its_attr = kvm_bindings::kvm_device_attr { + group, + attr: attr as u64, + 
addr: &mut val as *mut u64 as u64, + flags: 0, + }; + + // SAFETY: gicv3_its_attr.addr is safe to write to. + unsafe { its_device.get_device_attr(&mut gicv3_its_attr) } + .map_err(|err| GicError::DeviceAttribute(err, false, group))?; + + Ok(val) +} + +fn its_read_register(its_fd: &DeviceFd, attr: u32) -> Result { + get_device_attribute(its_fd, KVM_DEV_ARM_VGIC_GRP_ITS_REGS, attr) +} + +fn its_set_register(its_fd: &DeviceFd, attr: u32, val: u64) -> Result<(), GicError> { + set_device_attribute(its_fd, KVM_DEV_ARM_VGIC_GRP_ITS_REGS, attr, val) +} + +pub fn its_save_tables(its_fd: &DeviceFd) -> Result<(), GicError> { + set_device_attribute( + its_fd, + KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_ITS_SAVE_TABLES, + 0, + ) +} + +pub fn its_restore_tables(its_fd: &DeviceFd) -> Result<(), GicError> { + set_device_attribute( + its_fd, + KVM_DEV_ARM_VGIC_GRP_CTRL, + KVM_DEV_ARM_ITS_RESTORE_TABLES, + 0, + ) +} + +/// ITS registers that we save/restore during snapshot +#[derive(Debug, Default, Serialize, Deserialize)] +pub struct ItsRegisterState { + iidr: u64, + cbaser: u64, + creadr: u64, + cwriter: u64, + baser: [u64; 8], + ctlr: u64, +} + +impl ItsRegisterState { + /// Save ITS state + pub fn save(its_fd: &DeviceFd) -> Result { + let mut state = ItsRegisterState::default(); + + for i in 0..8 { + state.baser[i as usize] = its_read_register(its_fd, GITS_BASER + i * 8)?; + } + state.ctlr = its_read_register(its_fd, GITS_CTLR)?; + state.cbaser = its_read_register(its_fd, GITS_CBASER)?; + state.creadr = its_read_register(its_fd, GITS_CREADR)?; + state.cwriter = its_read_register(its_fd, GITS_CWRITER)?; + state.iidr = its_read_register(its_fd, GITS_IIDR)?; + + Ok(state) + } + + /// Restore ITS state + /// + /// We need to restore ITS registers in a very specific order for things to work. 
Take a look + /// at: + /// https://elixir.bootlin.com/linux/v6.1.141/source/Documentation/virt/kvm/devices/arm-vgic-its.rst#L60 + /// and + /// https://elixir.bootlin.com/linux/v6.1.141/source/Documentation/virt/kvm/devices/arm-vgic-its.rst#L123 + /// + /// for more details, but TL;DR is: + /// + /// We need to restore GITS_CBASER, GITS_CREADER, GITS_CWRITER, GITS_BASER and GITS_IIDR + /// registers before restoring ITS tables from guest memory. We also need to set GITS_CTLR + /// last. + pub fn restore(&self, its_fd: &DeviceFd) -> Result<(), GicError> { + its_set_register(its_fd, GITS_IIDR, self.iidr)?; + its_set_register(its_fd, GITS_CBASER, self.cbaser)?; + its_set_register(its_fd, GITS_CREADR, self.creadr)?; + its_set_register(its_fd, GITS_CWRITER, self.cwriter)?; + for i in 0..8 { + its_set_register(its_fd, GITS_BASER + i * 8, self.baser[i as usize])?; + } + // We need to restore saved ITS tables before restoring GITS_CTLR + its_restore_tables(its_fd)?; + its_set_register(its_fd, GITS_CTLR, self.ctlr) + } +} diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs b/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs index 0531766dc54..3df0d4642d7 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv3/regs/mod.rs @@ -3,45 +3,63 @@ mod dist_regs; mod icc_regs; +pub mod its_regs; mod redist_regs; +use its_regs::{ItsRegisterState, its_save_tables}; use kvm_ioctls::DeviceFd; use crate::arch::aarch64::gic::GicError; use crate::arch::aarch64::gic::regs::{GicState, GicVcpuState}; /// Save the state of the GIC device. -pub fn save_state(fd: &DeviceFd, mpidrs: &[u64]) -> Result { +pub fn save_state( + gic_device: &DeviceFd, + its_device: &DeviceFd, + mpidrs: &[u64], +) -> Result { // Flush redistributors pending tables to guest RAM. - super::save_pending_tables(fd)?; + super::save_pending_tables(gic_device)?; + // Flush ITS tables into guest memory. 
+ its_save_tables(its_device)?; let mut vcpu_states = Vec::with_capacity(mpidrs.len()); for mpidr in mpidrs { vcpu_states.push(GicVcpuState { - rdist: redist_regs::get_redist_regs(fd, *mpidr)?, - icc: icc_regs::get_icc_regs(fd, *mpidr)?, + rdist: redist_regs::get_redist_regs(gic_device, *mpidr)?, + icc: icc_regs::get_icc_regs(gic_device, *mpidr)?, }) } + let its_state = ItsRegisterState::save(its_device)?; + Ok(GicState { - dist: dist_regs::get_dist_regs(fd)?, + dist: dist_regs::get_dist_regs(gic_device)?, gic_vcpu_states: vcpu_states, + its_state: Some(its_state), }) } /// Restore the state of the GIC device. -pub fn restore_state(fd: &DeviceFd, mpidrs: &[u64], state: &GicState) -> Result<(), GicError> { - dist_regs::set_dist_regs(fd, &state.dist)?; +pub fn restore_state( + gic_device: &DeviceFd, + its_device: &DeviceFd, + mpidrs: &[u64], + state: &GicState, +) -> Result<(), GicError> { + dist_regs::set_dist_regs(gic_device, &state.dist)?; if mpidrs.len() != state.gic_vcpu_states.len() { return Err(GicError::InconsistentVcpuCount); } for (mpidr, vcpu_state) in mpidrs.iter().zip(&state.gic_vcpu_states) { - redist_regs::set_redist_regs(fd, *mpidr, &vcpu_state.rdist)?; - icc_regs::set_icc_regs(fd, *mpidr, &vcpu_state.icc)?; + redist_regs::set_redist_regs(gic_device, *mpidr, &vcpu_state.rdist)?; + icc_regs::set_icc_regs(gic_device, *mpidr, &vcpu_state.icc)?; } - Ok(()) + // Safe to unwrap here, as we know we support an ITS device, so `its_state.is_some()` is always + // `true`. + state.its_state.as_ref().unwrap().restore(its_device) } #[cfg(test)] @@ -59,9 +77,10 @@ mod tests { let vm = kvm.create_vm().unwrap(); let gic = create_gic(&vm, 1, Some(GICVersion::GICV3)).expect("Cannot create gic"); let gic_fd = gic.device_fd(); + let its_fd = gic.its_fd().unwrap(); let mpidr = vec![1]; - let res = save_state(gic_fd, &mpidr); + let res = save_state(gic_fd, its_fd, &mpidr); // We will receive an error if trying to call before creating vcpu. 
assert_eq!( format!("{:?}", res.unwrap_err()), @@ -73,8 +92,9 @@ mod tests { let _vcpu = vm.create_vcpu(0).unwrap(); let gic = create_gic(&vm, 1, Some(GICVersion::GICV3)).expect("Cannot create gic"); let gic_fd = gic.device_fd(); + let its_fd = gic.its_fd().unwrap(); - let vm_state = save_state(gic_fd, &mpidr).unwrap(); + let vm_state = save_state(gic_fd, its_fd, &mpidr).unwrap(); let val: u32 = 0; let gicd_statusr_off = 0x0010u64; let mut gic_dist_attr = kvm_bindings::kvm_device_attr { @@ -94,7 +114,7 @@ mod tests { assert_eq!(gicd_statusr.chunks[0], val); assert_eq!(vm_state.dist.len(), 12); - restore_state(gic_fd, &mpidr, &vm_state).unwrap(); - restore_state(gic_fd, &[1, 2], &vm_state).unwrap_err(); + restore_state(gic_fd, its_fd, &mpidr, &vm_state).unwrap(); + restore_state(gic_fd, its_fd, &[1, 2], &vm_state).unwrap_err(); } } diff --git a/src/vmm/src/arch/aarch64/gic/gicv3/regs/redist_regs.rs b/src/vmm/src/arch/aarch64/gic/gicv3/regs/redist_regs.rs index 4d1ba3292c1..96aaebc87bd 100644 --- a/src/vmm/src/arch/aarch64/gic/gicv3/regs/redist_regs.rs +++ b/src/vmm/src/arch/aarch64/gic/gicv3/regs/redist_regs.rs @@ -28,11 +28,11 @@ const GICR_ICFGR0: SimpleReg = SimpleReg::new(GICR_SGI_OFFSET + 0x0C00, 8); // List with relevant redistributor registers that we will be restoring. static VGIC_RDIST_REGS: &[SimpleReg] = &[ - GICR_CTLR, GICR_STATUSR, GICR_WAKER, GICR_PROPBASER, GICR_PENDBASER, + GICR_CTLR, ]; // List with relevant SGI associated redistributor registers that we will be restoring. 
diff --git a/src/vmm/src/arch/aarch64/gic/mod.rs b/src/vmm/src/arch/aarch64/gic/mod.rs index cda423f478c..9bfabee1fea 100644 --- a/src/vmm/src/arch/aarch64/gic/mod.rs +++ b/src/vmm/src/arch/aarch64/gic/mod.rs @@ -21,8 +21,14 @@ pub struct GIC { /// GIC device properties, to be used for setting up the fdt entry properties: [u64; 4], + /// MSI properties of the GIC device + msi_properties: Option<[u64; 2]>, + /// Number of CPUs handled by the device vcpu_count: u64, + + /// ITS device + its_device: Option, } impl GIC { /// Returns the file descriptor of the GIC device @@ -80,6 +86,14 @@ impl GICDevice { } } + /// Returns the file descriptor of the ITS device, if any + pub fn its_fd(&self) -> Option<&DeviceFd> { + match self { + Self::V2(_) => None, + Self::V3(x) => x.its_device.as_ref(), + } + } + /// Returns an array with GIC device properties pub fn device_properties(&self) -> &[u64] { match self { @@ -88,6 +102,14 @@ impl GICDevice { } } + /// Returns an array with MSI properties if GIC supports it + pub fn msi_properties(&self) -> Option<&[u64; 2]> { + match self { + Self::V2(x) => x.msi_properties.as_ref(), + Self::V3(x) => x.msi_properties.as_ref(), + } + } + /// Returns the number of vCPUs this GIC handles pub fn vcpu_count(&self) -> u64 { match self { diff --git a/src/vmm/src/arch/aarch64/gic/regs.rs b/src/vmm/src/arch/aarch64/gic/regs.rs index 60987cc973d..1afa7acde9c 100644 --- a/src/vmm/src/arch/aarch64/gic/regs.rs +++ b/src/vmm/src/arch/aarch64/gic/regs.rs @@ -10,6 +10,7 @@ use kvm_ioctls::DeviceFd; use serde::{Deserialize, Serialize}; use crate::arch::aarch64::gic::GicError; +use crate::arch::aarch64::gic::gicv3::regs::its_regs::ItsRegisterState; #[derive(Debug, Serialize, Deserialize)] pub struct GicRegState { @@ -30,6 +31,8 @@ pub struct GicState { pub dist: Vec>, /// The state of the vcpu interfaces. pub gic_vcpu_states: Vec, + /// The state of the ITS device. Only present with GICv3. 
+ pub its_state: Option, } /// Structure used for serializing the state of the GIC registers for a specific vCPU. diff --git a/src/vmm/src/arch/aarch64/output_GICv3.dtb b/src/vmm/src/arch/aarch64/output_GICv3.dtb index 03fba87f4fedcb57536d5219315cbe6474adc7b9..35f4e9b63a35caa91b793f37e857fe3ae1c3f3aa 100644 GIT binary patch delta 357 zcmYMuy-EW?5Ww->yIVDCl1t)O{5VOMB3B72SO`|O;u9Q)%0WKhE?8MCSmXir0ek@) zLGT&USy*Umt=KE5|49S~emk=>%(B*6ZP9)#f4?hb5~b9>Qo%8~jT%A_x=BBDEz`I= zxbhO+E3$MtQ9ijUb&~fw7g-u#?!?)!m`r6RF^1Fu&i`t;%j+jlhhK6M-t=_$-kyGT zZ(h-Qopk>9KXiXrTC!ilkF)-%uDd7yrNX!3bkHB>gKqaMcelaOb!0}7(eU6Sfb?cC oi#e1qj|D8Ej0&mpamDL6FH?Hyqx%cf!So?tb`a45xjxq1YuB_ArGB|tOs0pfWAT&Zd|zX z8XmwaxEGD$-xIOm>nf_6qOvTL49(Zl&%09QNh#G<%D*KyQ9~7koY8OFI}<#gPQ9-5 zM{zWtC{3=ox|{ZoZsI7M9Eb6EluzBv1>5s~KKc5ub~CsA66*ZRy@WS^?VEQZGh5f` zp%n7g{{jhQ&~h!epvY+r`=~q8vNQc%=B_Xr4#w%Q*SkvXhktROn1{p&JZSiEy#f}n nh$5D-j1`nnMg>*Wu!;b6tYIA+*u)mLv4dUgVgDsD2c6GfW|v3F delta 249 zcmXBKy9ok86ouhC&Wz9X{eFXqMn)o{kywHaSb!K9nJg@9U;)#D4Qym;2?n-dEC`DK z;)RdHIou!!RDj7n|9y(srHCvgi7UIFi@@K{RQ2 zH0m&qr?_Y6HRqYF8oQnOxV>G6zCHU^Aps48cuOLMG&0B{2M>7^P(%r3R8U0?bu`dK U3vG1JMGt)puD%_HZ+EksA4L*6$N&HU From a943f69b2fc91dbd0f18fa29733fbefa788412a0 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Tue, 17 Jun 2025 13:13:26 +0200 Subject: [PATCH 51/56] test: VirtIO PCI device create and restoration Refactor the test code that inserts VirtIO devices in a Vmm object and then add a test which creates a Vmm with PCI devices and then serializes and deserializes the device manager and ensures that everything is as restored as expected. 
Signed-off-by: Babis Chalios --- src/vmm/src/builder.rs | 12 -- src/vmm/src/device_manager/mod.rs | 30 ++++ src/vmm/src/device_manager/pci_mngr.rs | 184 +++++++++++++++++++++++++ 3 files changed, 214 insertions(+), 12 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 4b998fdf138..e196ef505c2 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -848,7 +848,6 @@ pub(crate) mod tests { assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_VSOCK, &vsock_dev_id) .is_some() ); @@ -874,7 +873,6 @@ pub(crate) mod tests { assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_RNG, ENTROPY_DEV_ID) .is_some() ); @@ -909,7 +907,6 @@ pub(crate) mod tests { assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BALLOON, BALLOON_DEV_ID) .is_some() ); @@ -961,7 +958,6 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=/dev/vda ro")); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -983,7 +979,6 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=PARTUUID=0eaa91a0-01 rw")); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1006,7 +1001,6 @@ pub(crate) mod tests { assert!(!cmdline_contains(&cmdline, "root=/dev/vda")); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1044,19 +1038,16 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=PARTUUID=0eaa91a0-01 rw")); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, "root") .is_some() ); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, "secondary") .is_some() ); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, "third") .is_some() ); @@ -1086,7 +1077,6 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=/dev/vda rw")); assert!( 
vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1108,7 +1098,6 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=PARTUUID=0eaa91a0-01 ro")); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); @@ -1130,7 +1119,6 @@ pub(crate) mod tests { assert!(cmdline_contains(&cmdline, "root=/dev/vda rw")); assert!( vmm.device_manager - .mmio_devices .get_virtio_device(TYPE_BLOCK, drive_id.as_str()) .is_some() ); diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index c641a1aac0e..2135711ea54 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -375,6 +375,36 @@ impl DeviceManager { Self::do_mark_virtio_queue_memory_dirty(virtio_device, mem); } } + + /// Get a VirtIO device of type `virtio_type` with ID `device_id` + pub fn get_virtio_device( + &self, + virtio_type: u32, + device_id: &str, + ) -> Option>> { + if self.pci_devices.pci_segment.is_some() { + let pci_device = self.pci_devices.get_virtio_device(virtio_type, device_id)?; + Some( + pci_device + .lock() + .expect("Poisoned lock") + .virtio_device() + .clone(), + ) + } else { + let mmio_device = self + .mmio_devices + .get_virtio_device(virtio_type, device_id)?; + Some( + mmio_device + .inner + .lock() + .expect("Poisoned lock") + .device() + .clone(), + ) + } + } } #[derive(Debug, Default, Clone, Serialize, Deserialize)] diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index 303a34a3448..5c09085e84d 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -585,3 +585,187 @@ impl<'a> Persist<'a> for PciDevices { Ok(pci_devices) } } + +#[cfg(test)] +mod tests { + use vmm_sys_util::tempfile::TempFile; + + use super::*; + use crate::builder::tests::*; + use crate::device_manager; + use crate::devices::virtio::block::CacheType; + use 
crate::mmds::data_store::MmdsVersion; + use crate::resources::VmmConfig; + use crate::snapshot::Snapshot; + use crate::vmm_config::balloon::BalloonDeviceConfig; + use crate::vmm_config::entropy::EntropyDeviceConfig; + use crate::vmm_config::net::NetworkInterfaceConfig; + use crate::vmm_config::vsock::VsockDeviceConfig; + + #[test] + fn test_device_manager_persistence() { + let mut buf = vec![0; 65536]; + // These need to survive so the restored blocks find them. + let _block_files; + let mut tmp_sock_file = TempFile::new().unwrap(); + tmp_sock_file.remove().unwrap(); + // Set up a vmm with one of each device, and get the serialized DeviceStates. + { + let mut event_manager = EventManager::new().expect("Unable to create EventManager"); + let mut vmm = default_vmm(); + vmm.device_manager.enable_pci(&vmm.vm).unwrap(); + let mut cmdline = default_kernel_cmdline(); + + // Add a balloon device. + let balloon_cfg = BalloonDeviceConfig { + amount_mib: 123, + deflate_on_oom: false, + stats_polling_interval_s: 1, + }; + insert_balloon_device(&mut vmm, &mut cmdline, &mut event_manager, balloon_cfg); + // Add a block device. + let drive_id = String::from("root"); + let block_configs = vec![CustomBlockConfig::new( + drive_id, + true, + None, + true, + CacheType::Unsafe, + )]; + _block_files = + insert_block_devices(&mut vmm, &mut cmdline, &mut event_manager, block_configs); + // Add a net device. + let network_interface = NetworkInterfaceConfig { + iface_id: String::from("netif"), + host_dev_name: String::from("hostname"), + guest_mac: None, + rx_rate_limiter: None, + tx_rate_limiter: None, + }; + insert_net_device_with_mmds( + &mut vmm, + &mut cmdline, + &mut event_manager, + network_interface, + MmdsVersion::V2, + ); + // Add a vsock device. 
+ let vsock_dev_id = "vsock"; + let vsock_config = VsockDeviceConfig { + vsock_id: Some(vsock_dev_id.to_string()), + guest_cid: 3, + uds_path: tmp_sock_file.as_path().to_str().unwrap().to_string(), + }; + insert_vsock_device(&mut vmm, &mut cmdline, &mut event_manager, vsock_config); + // Add an entropy device. + let entropy_config = EntropyDeviceConfig::default(); + insert_entropy_device(&mut vmm, &mut cmdline, &mut event_manager, entropy_config); + + Snapshot::serialize(&mut buf.as_mut_slice(), &vmm.device_manager.save()).unwrap(); + } + + tmp_sock_file.remove().unwrap(); + + let mut event_manager = EventManager::new().expect("Unable to create EventManager"); + // Keep in mind we are re-creating here an empty DeviceManager. Restoring later on + // will create a new PciDevices manager different than vmm.pci_devices. We're doing + // this to avoid restoring the whole Vmm, since what we really need from Vmm is the Vm + // object and calling default_vmm() is the easiest way to create one. + let vmm = default_vmm(); + let device_manager_state: device_manager::DevicesState = + Snapshot::deserialize(&mut buf.as_slice()).unwrap(); + let vm_resources = &mut VmResources::default(); + let restore_args = PciDevicesConstructorArgs { + vm: vmm.vm.clone(), + mem: vmm.vm.guest_memory(), + vm_resources, + instance_id: "microvm-id", + restored_from_file: true, + event_manager: &mut event_manager, + }; + let _restored_dev_manager = + PciDevices::restore(restore_args, &device_manager_state.pci_state).unwrap(); + + let expected_vm_resources = format!( + r#"{{ + "balloon": {{ + "amount_mib": 123, + "deflate_on_oom": false, + "stats_polling_interval_s": 1 + }}, + "drives": [ + {{ + "drive_id": "root", + "partuuid": null, + "is_root_device": true, + "cache_type": "Unsafe", + "is_read_only": true, + "path_on_host": "{}", + "rate_limiter": null, + "io_engine": "Sync", + "socket": null + }} + ], + "boot-source": {{ + "kernel_image_path": "", + "initrd_path": null, + "boot_args": null + }}, 
+ "cpu-config": null, + "logger": null, + "machine-config": {{ + "vcpu_count": 1, + "mem_size_mib": 128, + "smt": false, + "track_dirty_pages": false, + "huge_pages": "None" + }}, + "metrics": null, + "mmds-config": {{ + "version": "V2", + "network_interfaces": [ + "netif" + ], + "ipv4_address": "169.254.169.254" + }}, + "network-interfaces": [ + {{ + "iface_id": "netif", + "host_dev_name": "hostname", + "guest_mac": null, + "rx_rate_limiter": null, + "tx_rate_limiter": null + }} + ], + "vsock": {{ + "guest_cid": 3, + "uds_path": "{}" + }}, + "entropy": {{ + "rate_limiter": null + }} +}}"#, + _block_files.last().unwrap().as_path().to_str().unwrap(), + tmp_sock_file.as_path().to_str().unwrap() + ); + + assert_eq!( + vm_resources + .mmds + .as_ref() + .unwrap() + .lock() + .unwrap() + .version(), + MmdsVersion::V2 + ); + assert_eq!( + device_manager_state.pci_state.mmds_version.unwrap(), + MmdsVersion::V2.into() + ); + assert_eq!( + expected_vm_resources, + serde_json::to_string_pretty(&VmmConfig::from(&*vm_resources)).unwrap() + ); + } +} From 0b68f8f8fb5c8518ba2770f6fff4cd51fc2689db Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 20 Jun 2025 14:14:58 +0200 Subject: [PATCH 52/56] test: enable PCI microVMs for performance testing Use pci_enabled fixture for boot time, block, and network tests to create PCI microVM variants as well. 
Signed-off-by: Babis Chalios --- tests/framework/microvm.py | 1 + .../performance/test_block.py | 3 ++- .../performance/test_boottime.py | 25 ++++++++++++++----- .../performance/test_network.py | 4 +-- .../performance/test_vsock.py | 3 ++- 5 files changed, 26 insertions(+), 10 deletions(-) diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index fef1a68eb4b..b9a97c19591 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -481,6 +481,7 @@ def dimensions(self): "rootfs": self.rootfs_file.name, "vcpus": str(self.vcpus_count), "guest_memory": f"{self.mem_size_bytes / (1024 * 1024)}MB", + "pci": f"{self.pci_enabled}", } @property diff --git a/tests/integration_tests/performance/test_block.py b/tests/integration_tests/performance/test_block.py index dfd0728084a..7fe9216e559 100644 --- a/tests/integration_tests/performance/test_block.py +++ b/tests/integration_tests/performance/test_block.py @@ -168,6 +168,7 @@ def test_block_performance( fio_mode, fio_block_size, fio_engine, + pci_enabled, io_engine, metrics, results_dir, @@ -176,7 +177,7 @@ def test_block_performance( Execute block device emulation benchmarking scenarios. """ vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False) - vm.spawn(log_level="Info", emit_metrics=True) + vm.spawn(log_level="Info", emit_metrics=True, pci=pci_enabled) vm.basic_config(vcpu_count=vcpus, mem_size_mib=GUEST_MEM_MIB) vm.add_net_iface() # Add a secondary block device for benchmark tests. 
diff --git a/tests/integration_tests/performance/test_boottime.py b/tests/integration_tests/performance/test_boottime.py index 7708451ec7f..4eb9a267475 100644 --- a/tests/integration_tests/performance/test_boottime.py +++ b/tests/integration_tests/performance/test_boottime.py @@ -95,12 +95,12 @@ def to_ms(v, unit): def launch_vm_with_boot_timer( - microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib + microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib, pci_enabled ): """Launches a microVM with guest-timer and returns the reported metrics for it""" vm = microvm_factory.build(guest_kernel_acpi, rootfs_rw) vm.jailer.extra_args.update({"boot-timer": None}) - vm.spawn() + vm.spawn(pci=pci_enabled) vm.basic_config( vcpu_count=vcpu_count, mem_size_mib=mem_size_mib, @@ -116,9 +116,11 @@ def launch_vm_with_boot_timer( return (vm, boot_time_us, cpu_boot_time_us) -def test_boot_timer(microvm_factory, guest_kernel_acpi, rootfs): +def test_boot_timer(microvm_factory, guest_kernel_acpi, rootfs, pci_enabled): """Tests that the boot timer device works""" - launch_vm_with_boot_timer(microvm_factory, guest_kernel_acpi, rootfs, 1, 128) + launch_vm_with_boot_timer( + microvm_factory, guest_kernel_acpi, rootfs, 1, 128, pci_enabled + ) @pytest.mark.parametrize( @@ -127,13 +129,24 @@ def test_boot_timer(microvm_factory, guest_kernel_acpi, rootfs): ) @pytest.mark.nonci def test_boottime( - microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib, metrics + microvm_factory, + guest_kernel_acpi, + rootfs_rw, + vcpu_count, + mem_size_mib, + pci_enabled, + metrics, ): """Test boot time with different guest configurations""" for i in range(10): vm, boot_time_us, cpu_boot_time_us = launch_vm_with_boot_timer( - microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib + microvm_factory, + guest_kernel_acpi, + rootfs_rw, + vcpu_count, + mem_size_mib, + pci_enabled, ) if i == 0: diff --git 
a/tests/integration_tests/performance/test_network.py b/tests/integration_tests/performance/test_network.py index 3355d54c2bc..4c2deba0041 100644 --- a/tests/integration_tests/performance/test_network.py +++ b/tests/integration_tests/performance/test_network.py @@ -38,7 +38,7 @@ def consume_ping_output(ping_putput, request_per_round): @pytest.fixture -def network_microvm(request, microvm_factory, guest_kernel_acpi, rootfs): +def network_microvm(request, microvm_factory, guest_kernel_acpi, rootfs, pci_enabled): """Creates a microvm with the networking setup used by the performance tests in this file. This fixture receives its vcpu count via indirect parameterization""" @@ -46,7 +46,7 @@ def network_microvm(request, microvm_factory, guest_kernel_acpi, rootfs): guest_vcpus = request.param vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False) - vm.spawn(log_level="Info", emit_metrics=True) + vm.spawn(log_level="Info", emit_metrics=True, pci=pci_enabled) vm.basic_config(vcpu_count=guest_vcpus, mem_size_mib=guest_mem_mib) vm.add_net_iface() vm.start() diff --git a/tests/integration_tests/performance/test_vsock.py b/tests/integration_tests/performance/test_vsock.py index bad4436e568..5a023f53eea 100644 --- a/tests/integration_tests/performance/test_vsock.py +++ b/tests/integration_tests/performance/test_vsock.py @@ -80,6 +80,7 @@ def test_vsock_throughput( rootfs, vcpus, payload_length, + pci_enabled, mode, metrics, results_dir, @@ -95,7 +96,7 @@ def test_vsock_throughput( mem_size_mib = 1024 vm = microvm_factory.build(guest_kernel_acpi, rootfs, monitor_memory=False) - vm.spawn(log_level="Info", emit_metrics=True) + vm.spawn(log_level="Info", emit_metrics=True, pci=pci_enabled) vm.basic_config(vcpu_count=vcpus, mem_size_mib=mem_size_mib) vm.add_net_iface() # Create a vsock device From f9f0c08ddafc709c1ca8f2fa1102aac85929664d Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 25 Jun 2025 10:00:15 +0200 Subject: [PATCH 53/56] test: remove pci=off 
default from various parts in tests We only pass pci=off if PCI is disabled in Firecracker. Adapt tests and comments to reflect that. Signed-off-by: Babis Chalios --- tests/framework/microvm.py | 6 ++++-- tests/integration_tests/performance/test_boottime.py | 5 +++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index b9a97c19591..5c936fd8395 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -784,8 +784,10 @@ def basic_config( the response is within the interval [200, 300). If boot_args is None, the default boot_args in Firecracker is - reboot=k panic=1 pci=off nomodule 8250.nr_uarts=0 - i8042.noaux i8042.nomux i8042.nopnp i8042.dumbkbd + reboot=k panic=1 nomodule 8250.nr_uarts=0 i8042.noaux i8042.nomux + i8042.nopnp i8042.dumbkbd + + if PCI is disabled, Firecracker also passes to the guest pci=off Reference: file:../../src/vmm/src/vmm_config/boot_source.rs::DEFAULT_KERNEL_CMDLINE """ diff --git a/tests/integration_tests/performance/test_boottime.py b/tests/integration_tests/performance/test_boottime.py index 4eb9a267475..173e352f67d 100644 --- a/tests/integration_tests/performance/test_boottime.py +++ b/tests/integration_tests/performance/test_boottime.py @@ -11,7 +11,7 @@ # Regex for obtaining boot time from some string. 
DEFAULT_BOOT_ARGS = ( - "reboot=k panic=1 pci=off nomodule 8250.nr_uarts=0" + "reboot=k panic=1 nomodule 8250.nr_uarts=0" " i8042.noaux i8042.nomux i8042.nopnp i8042.dumbkbd" ) @@ -98,13 +98,14 @@ def launch_vm_with_boot_timer( microvm_factory, guest_kernel_acpi, rootfs_rw, vcpu_count, mem_size_mib, pci_enabled ): """Launches a microVM with guest-timer and returns the reported metrics for it""" + boot_args = DEFAULT_BOOT_ARGS if pci_enabled else DEFAULT_BOOT_ARGS + " pci=off" vm = microvm_factory.build(guest_kernel_acpi, rootfs_rw) vm.jailer.extra_args.update({"boot-timer": None}) vm.spawn(pci=pci_enabled) vm.basic_config( vcpu_count=vcpu_count, mem_size_mib=mem_size_mib, - boot_args=DEFAULT_BOOT_ARGS + " init=/usr/local/bin/init", + boot_args=boot_args + " init=/usr/local/bin/init", enable_entropy_device=True, ) vm.add_net_iface() From d9fc9e07146a8cb08f2cf7c6177ebbb0625c299c Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 30 Jun 2025 12:26:48 +0200 Subject: [PATCH 54/56] virtio: add kick() method in VirtioDevice trait So that we don't have to downcast VirtioDevice trait objects to the actual device type before calling the logic to process events for each device. 
Signed-off-by: Babis Chalios --- src/vmm/src/device_manager/mod.rs | 89 +++----------------- src/vmm/src/devices/virtio/balloon/device.rs | 12 ++- src/vmm/src/devices/virtio/block/device.rs | 13 +++ src/vmm/src/devices/virtio/device.rs | 3 + src/vmm/src/devices/virtio/net/device.rs | 13 ++- src/vmm/src/devices/virtio/rng/device.rs | 8 ++ src/vmm/src/devices/virtio/vsock/device.rs | 15 +++- 7 files changed, 74 insertions(+), 79 deletions(-) diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 2135711ea54..34d1ba73091 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -30,14 +30,8 @@ use crate::devices::legacy::RTCDevice; use crate::devices::legacy::serial::SerialOut; use crate::devices::legacy::{IER_RDA_BIT, IER_RDA_OFFSET, SerialDevice}; use crate::devices::pseudo::BootTimer; -use crate::devices::virtio::balloon::Balloon; -use crate::devices::virtio::block::device::Block; use crate::devices::virtio::device::VirtioDevice; -use crate::devices::virtio::net::Net; -use crate::devices::virtio::rng::Entropy; use crate::devices::virtio::transport::mmio::{IrqTrigger, MmioTransport}; -use crate::devices::virtio::vsock::{TYPE_VSOCK, Vsock, VsockUnixBackend}; -use crate::devices::virtio::{TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, TYPE_RNG}; use crate::resources::VmResources; use crate::snapshot::Persist; use crate::vstate::memory::GuestMemoryMmap; @@ -265,85 +259,28 @@ impl DeviceManager { self.pci_devices.attach_pci_segment(vm) } - fn do_kick_device(virtio_device: Arc>) { - let mut device = virtio_device.lock().expect("Poisoned lock"); - match device.device_type() { - TYPE_BALLOON => { - let balloon = device.as_mut_any().downcast_mut::().unwrap(); - // If device is activated, kick the balloon queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in snapshot. - // Stats queue doesn't need kicking as it is notified via a `timer_fd`. 
- if balloon.is_activated() { - info!("kick balloon {}.", balloon.id()); - balloon.process_virtio_queues().unwrap(); - } - } - TYPE_BLOCK => { - // We only care about kicking virtio block. - // If we need to kick vhost-user-block we can do nothing. - if let Some(block) = device.as_mut_any().downcast_mut::() { - // If device is activated, kick the block queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in - // snapshot. No need to kick Ratelimiters - // because they are restored 'unblocked' so - // any inflight `timer_fd` events can be safely discarded. - if block.is_activated() { - info!("kick block {}.", block.id()); - block.process_virtio_queues().unwrap(); - } - } - } - TYPE_NET => { - let net = device.as_mut_any().downcast_mut::().unwrap(); - // If device is activated, kick the net queue(s) to make up for any - // pending or in-flight epoll events we may have not captured in snapshot. - // No need to kick Ratelimiters because they are restored 'unblocked' so - // any inflight `timer_fd` events can be safely discarded. - if net.is_activated() { - info!("kick net {}.", net.id()); - net.process_virtio_queues().unwrap(); - } - } - TYPE_VSOCK => { - // Vsock has complicated protocol that isn't resilient to any packet loss, - // so for Vsock we don't support connection persistence through snapshot. - // Any in-flight packets or events are simply lost. - // Vsock is restored 'empty'. - // The only reason we still `kick` it is to make guest process - // `TRANSPORT_RESET_EVENT` event we sent during snapshot creation. 
- let vsock = device - .as_mut_any() - .downcast_mut::>() - .unwrap(); - if vsock.is_activated() { - info!("kick vsock {}.", vsock.id()); - vsock.signal_used_queue(0).unwrap(); - } - } - TYPE_RNG => { - let entropy = device.as_mut_any().downcast_mut::().unwrap(); - if entropy.is_activated() { - info!("kick entropy {}.", entropy.id()); - entropy.process_virtio_queues().unwrap(); - } - } - _ => (), - } - } - /// Artificially kick VirtIO devices as if they had external events. pub fn kick_virtio_devices(&self) { info!("Artificially kick devices"); // Go through MMIO VirtIO devices let _: Result<(), MmioError> = self.mmio_devices.for_each_virtio_device(|_, _, device| { let mmio_transport_locked = device.inner.lock().expect("Poisoned lock"); - Self::do_kick_device(mmio_transport_locked.device()); + mmio_transport_locked + .device() + .lock() + .expect("Poisoned lock") + .kick(); Ok(()) }); // Go through PCI VirtIO devices - for device in self.pci_devices.virtio_devices.values() { - let virtio_device = device.lock().expect("Poisoned lock").virtio_device(); - Self::do_kick_device(virtio_device); + for virtio_pci_device in self.pci_devices.virtio_devices.values() { + virtio_pci_device + .lock() + .expect("Poisoned lock") + .virtio_device() + .lock() + .expect("Poisoned lock") + .kick(); } } diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs index 8b06e8ea38f..ba9e0ed5b90 100644 --- a/src/vmm/src/devices/virtio/balloon/device.rs +++ b/src/vmm/src/devices/virtio/balloon/device.rs @@ -5,7 +5,7 @@ use std::ops::Deref; use std::sync::Arc; use std::time::Duration; -use log::error; +use log::{error, info}; use serde::Serialize; use timerfd::{ClockId, SetTimeFlags, TimerFd, TimerState}; use vmm_sys_util::eventfd::EventFd; @@ -621,6 +621,16 @@ impl VirtioDevice for Balloon { fn is_activated(&self) -> bool { self.device_state.is_activated() } + + fn kick(&mut self) { + // If device is activated, kick the balloon queue(s) to make 
up for any + // pending or in-flight epoll events we may have not captured in snapshot. + // Stats queue doesn't need kicking as it is notified via a `timer_fd`. + if self.is_activated() { + info!("kick balloon {}.", self.id()); + self.process_virtio_queues(); + } + } } #[cfg(test)] diff --git a/src/vmm/src/devices/virtio/block/device.rs b/src/vmm/src/devices/virtio/block/device.rs index d58550acc59..c1fa95f7b1c 100644 --- a/src/vmm/src/devices/virtio/block/device.rs +++ b/src/vmm/src/devices/virtio/block/device.rs @@ -4,6 +4,7 @@ use std::sync::Arc; use event_manager::{EventOps, Events, MutEventSubscriber}; +use log::info; use vmm_sys_util::eventfd::EventFd; use super::BlockError; @@ -214,6 +215,18 @@ impl VirtioDevice for Block { Self::VhostUser(b) => b.device_state.is_activated(), } } + + fn kick(&mut self) { + // If device is activated, kick the block queue(s) to make up for any + // pending or in-flight epoll events we may have not captured in + // snapshot. No need to kick Ratelimiters + // because they are restored 'unblocked' so + // any inflight `timer_fd` events can be safely discarded. + if self.is_activated() { + info!("kick block {}.", self.id()); + self.process_virtio_queues(); + } + } } impl MutEventSubscriber for Block { diff --git a/src/vmm/src/devices/virtio/device.rs b/src/vmm/src/devices/virtio/device.rs index 7b51a4b1dbf..ca3efc8cf2f 100644 --- a/src/vmm/src/devices/virtio/device.rs +++ b/src/vmm/src/devices/virtio/device.rs @@ -159,6 +159,9 @@ pub trait VirtioDevice: AsAny + Send { } Ok(()) } + + /// Kick the device, as if it had received external events. 
+ fn kick(&mut self) {} } impl fmt::Debug for dyn VirtioDevice { diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index 4c6022a0067..0b2f3150c09 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -13,7 +13,7 @@ use std::ops::Deref; use std::sync::{Arc, Mutex}; use libc::{EAGAIN, iovec}; -use log::error; +use log::{error, info}; use vmm_sys_util::eventfd::EventFd; use super::NET_QUEUE_MAX_SIZE; @@ -1059,6 +1059,17 @@ impl VirtioDevice for Net { fn is_activated(&self) -> bool { self.device_state.is_activated() } + + fn kick(&mut self) { + // If device is activated, kick the net queue(s) to make up for any + // pending or in-flight epoll events we may have not captured in snapshot. + // No need to kick Ratelimiters because they are restored 'unblocked' so + // any inflight `timer_fd` events can be safely discarded. + if self.is_activated() { + info!("kick net {}.", self.id()); + self.process_virtio_queues(); + } + } } #[cfg(test)] diff --git a/src/vmm/src/devices/virtio/rng/device.rs b/src/vmm/src/devices/virtio/rng/device.rs index a0b98cdc8b7..2cf1c6bf5dd 100644 --- a/src/vmm/src/devices/virtio/rng/device.rs +++ b/src/vmm/src/devices/virtio/rng/device.rs @@ -6,6 +6,7 @@ use std::ops::Deref; use std::sync::Arc; use aws_lc_rs::rand; +use log::info; use vm_memory::GuestMemoryError; use vmm_sys_util::eventfd::EventFd; @@ -312,6 +313,13 @@ impl VirtioDevice for Entropy { self.device_state = DeviceState::Activated(ActiveState { mem, interrupt }); Ok(()) } + + fn kick(&mut self) { + if self.is_activated() { + info!("kick entropy {}.", self.id()); + self.process_virtio_queues(); + } + } } #[cfg(test)] diff --git a/src/vmm/src/devices/virtio/vsock/device.rs b/src/vmm/src/devices/virtio/vsock/device.rs index 61ca3246d43..bef7fd0af4c 100644 --- a/src/vmm/src/devices/virtio/vsock/device.rs +++ b/src/vmm/src/devices/virtio/vsock/device.rs @@ -24,7 +24,7 @@ use std::fmt::Debug; use 
std::ops::Deref; use std::sync::Arc; -use log::{error, warn}; +use log::{error, info, warn}; use vmm_sys_util::eventfd::EventFd; use super::super::super::DeviceError; @@ -368,6 +368,19 @@ where fn is_activated(&self) -> bool { self.device_state.is_activated() } + + fn kick(&mut self) { + // Vsock has complicated protocol that isn't resilient to any packet loss, + // so for Vsock we don't support connection persistence through snapshot. + // Any in-flight packets or events are simply lost. + // Vsock is restored 'empty'. + // The only reason we still `kick` it is to make guest process + // `TRANSPORT_RESET_EVENT` event we sent during snapshot creation. + if self.is_activated() { + info!("kick vsock {}.", self.id()); + self.signal_used_queue(0).unwrap(); + } + } } #[cfg(test)] From 7375a93c4215eb948a0b02c2f562473e474bc2ed Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 30 Jun 2025 18:33:40 +0200 Subject: [PATCH 55/56] refactor: simplify ResourceAllocator internals Instead of storing internal allocators of ResourceAllocator within an `Arc<Mutex<>>` container, just store `ResourceAllocator` itself in an `Arc<Mutex<>>`. Apart from that, we get rid of the `ResourceAllocatorState` state object, and just clone `ResourceAllocator` itself when we want to save/restore. Also, make the creation of `ResourceAllocator` infallible, since we know that the ranges we are using are correct. Finally, fix saving/restoring the state of ResourceAllocator. We were actually not resetting it correctly upon snapshot restore. The reason why this was not a problem is that we don't actually need to perform any new allocations post restore at the moment. However, this way we are ready when we need to perform any hot-plugging operations. Also, add a unit-test to ensure that this logic works correctly.
Signed-off-by: Babis Chalios --- src/pci/Cargo.toml | 2 +- src/vmm/src/acpi/mod.rs | 33 +++---- src/vmm/src/arch/aarch64/vm.rs | 7 ++ src/vmm/src/arch/x86_64/mod.rs | 20 ++-- src/vmm/src/arch/x86_64/mptable.rs | 31 +++--- src/vmm/src/arch/x86_64/vm.rs | 9 +- src/vmm/src/builder.rs | 6 +- src/vmm/src/device_manager/mmio.rs | 20 ++-- src/vmm/src/device_manager/mod.rs | 2 +- src/vmm/src/device_manager/pci_mngr.rs | 21 ++-- src/vmm/src/device_manager/persist.rs | 2 +- src/vmm/src/devices/acpi/vmgenid.rs | 4 +- src/vmm/src/devices/pci/pci_segment.rs | 11 +-- src/vmm/src/vstate/resources.rs | 127 ++++++++----------------- src/vmm/src/vstate/vm.rs | 58 ++++++++++- 15 files changed, 182 insertions(+), 171 deletions(-) diff --git a/src/pci/Cargo.toml b/src/pci/Cargo.toml index 3549d5010fe..d179854f391 100644 --- a/src/pci/Cargo.toml +++ b/src/pci/Cargo.toml @@ -18,7 +18,7 @@ libc = "0.2.172" log = "0.4.27" serde = { version = "1.0.219", features = ["derive"] } thiserror = "2.0.12" -vm-allocator = "0.1.2" +vm-allocator = "0.1.3" vm-device = { path = "../vm-device" } vm-memory = { version = "0.16.1", features = [ "backend-mmap", diff --git a/src/vmm/src/acpi/mod.rs b/src/vmm/src/acpi/mod.rs index 51711d9eb92..f3b4164745a 100644 --- a/src/vmm/src/acpi/mod.rs +++ b/src/vmm/src/acpi/mod.rs @@ -54,7 +54,7 @@ impl AcpiTableWriter<'_> { /// buffer. It returns the address in which it wrote the table. 
fn write_acpi_table( &mut self, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, table: &mut S, ) -> Result where @@ -83,7 +83,7 @@ impl AcpiTableWriter<'_> { fn build_dsdt( &mut self, device_manager: &mut DeviceManager, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, ) -> Result { let mut dsdt_data = Vec::new(); @@ -111,7 +111,7 @@ impl AcpiTableWriter<'_> { /// This includes a pointer with the location of the DSDT in guest memory fn build_fadt( &mut self, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, dsdt_addr: u64, ) -> Result { let mut fadt = Fadt::new(OEM_ID, *b"FCVMFADT", OEM_REVISION); @@ -129,7 +129,7 @@ impl AcpiTableWriter<'_> { /// This includes information about the interrupt controllers supported in the platform fn build_madt( &mut self, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, nr_vcpus: u8, ) -> Result { let mut madt = Madt::new( @@ -147,7 +147,7 @@ impl AcpiTableWriter<'_> { /// Currently, we pass to the guest just FADT and MADT tables. fn build_xsdt( &mut self, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, fadt_addr: u64, madt_addr: u64, mcfg_addr: u64, @@ -164,7 +164,7 @@ impl AcpiTableWriter<'_> { /// Build the MCFG table for the guest. 
fn build_mcfg( &mut self, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, pci_mmio_config_addr: u64, ) -> Result { let mut mcfg = Mcfg::new(OEM_ID, *b"FCMVMCFG", OEM_REVISION, pci_mmio_config_addr); @@ -197,7 +197,7 @@ impl AcpiTableWriter<'_> { pub(crate) fn create_acpi_tables( mem: &GuestMemoryMmap, device_manager: &mut DeviceManager, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, vcpus: &[Vcpu], ) -> Result<(), AcpiError> { let mut writer = AcpiTableWriter { mem }; @@ -249,18 +249,19 @@ mod tests { let mut writer = AcpiTableWriter { mem: vmm.vm.guest_memory(), }; + let mut resource_allocator = vmm.vm.resource_allocator(); // This should succeed let mut sdt = MockSdt(vec![0; 4096]); let addr = writer - .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) + .write_acpi_table(&mut resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START); // Let's try to write two 4K pages plus one byte let mut sdt = MockSdt(vec![0; usize::try_from(SYSTEM_MEM_SIZE + 1).unwrap()]); let err = writer - .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) + .write_acpi_table(&mut resource_allocator, &mut sdt) .unwrap_err(); assert!( matches!( @@ -275,27 +276,27 @@ mod tests { // succeed. 
let mut sdt = MockSdt(vec![0; 5]); let addr = writer - .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) + .write_acpi_table(&mut resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4096); let mut sdt = MockSdt(vec![0; 2]); let addr = writer - .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) + .write_acpi_table(&mut resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4101); let mut sdt = MockSdt(vec![0; 4]); let addr = writer - .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) + .write_acpi_table(&mut resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4103); let mut sdt = MockSdt(vec![0; 8]); let addr = writer - .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) + .write_acpi_table(&mut resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4107); let mut sdt = MockSdt(vec![0; 16]); let addr = writer - .write_acpi_table(&vmm.vm.common.resource_allocator, &mut sdt) + .write_acpi_table(&mut resource_allocator, &mut sdt) .unwrap(); assert_eq!(addr, SYSTEM_MEM_START + 4115); } @@ -312,11 +313,11 @@ mod tests { let mut writer = AcpiTableWriter { mem: vm.guest_memory(), }; - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); let mut sdt = MockSdt(vec![0; usize::try_from(SYSTEM_MEM_SIZE).unwrap()]); let err = writer - .write_acpi_table(&resource_allocator, &mut sdt) + .write_acpi_table(&mut resource_allocator, &mut sdt) .unwrap_err(); assert!( matches!( diff --git a/src/vmm/src/arch/aarch64/vm.rs b/src/vmm/src/arch/aarch64/vm.rs index e54723f5b6d..eaec0932a42 100644 --- a/src/vmm/src/arch/aarch64/vm.rs +++ b/src/vmm/src/arch/aarch64/vm.rs @@ -1,11 +1,14 @@ // Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 +use std::sync::Mutex; + use serde::{Deserialize, Serialize}; use crate::Kvm; use crate::arch::aarch64::gic::GicState; use crate::vstate::memory::{GuestMemoryExtension, GuestMemoryState}; +use crate::vstate::resources::ResourceAllocator; use crate::vstate::vm::{VmCommon, VmError}; /// Structure representing the current architecture's understand of what a "virtual machine" is. @@ -74,6 +77,7 @@ impl ArchVm { .get_irqchip() .save_device(mpidrs) .map_err(ArchVmError::SaveGic)?, + resource_allocator: self.resource_allocator().clone(), }) } @@ -86,6 +90,7 @@ impl ArchVm { self.get_irqchip() .restore_device(mpidrs, &state.gic) .map_err(ArchVmError::RestoreGic)?; + self.common.resource_allocator = Mutex::new(state.resource_allocator.clone()); Ok(()) } @@ -98,4 +103,6 @@ pub struct VmState { pub memory: GuestMemoryState, /// GIC state. pub gic: GicState, + /// resource allocator + pub resource_allocator: ResourceAllocator, } diff --git a/src/vmm/src/arch/x86_64/mod.rs b/src/vmm/src/arch/x86_64/mod.rs index 5307dbdf710..1822abb9009 100644 --- a/src/vmm/src/arch/x86_64/mod.rs +++ b/src/vmm/src/arch/x86_64/mod.rs @@ -217,7 +217,7 @@ pub fn configure_system_for_boot( // Note that this puts the mptable at the last 1k of Linux's 640k base RAM mptable::setup_mptable( vm.guest_memory(), - &vm.common.resource_allocator, + &mut vm.resource_allocator(), vcpu_config.vcpu_count, ) .map_err(ConfigurationError::MpTableSetup)?; @@ -241,7 +241,7 @@ pub fn configure_system_for_boot( create_acpi_tables( vm.guest_memory(), device_manager, - &vm.common.resource_allocator, + &mut vm.resource_allocator(), vcpus, )?; Ok(()) @@ -607,8 +607,8 @@ mod tests { fn test_system_configuration() { let no_vcpus = 4; let gm = single_region_mem(0x10000); - let resource_allocator = ResourceAllocator::new().unwrap(); - let err = mptable::setup_mptable(&gm, &resource_allocator, 1); + let mut resource_allocator = ResourceAllocator::new(); + let err = 
mptable::setup_mptable(&gm, &mut resource_allocator, 1); assert!(matches!( err.unwrap_err(), mptable::MptableError::NotEnoughMemory @@ -617,24 +617,24 @@ mod tests { // Now assigning some memory that falls before the 32bit memory hole. let mem_size = mib_to_bytes(128); let gm = arch_mem(mem_size); - let resource_allocator = ResourceAllocator::new().unwrap(); - mptable::setup_mptable(&gm, &resource_allocator, no_vcpus).unwrap(); + let mut resource_allocator = ResourceAllocator::new(); + mptable::setup_mptable(&gm, &mut resource_allocator, no_vcpus).unwrap(); configure_64bit_boot(&gm, GuestAddress(0), 0, &None).unwrap(); configure_pvh(&gm, GuestAddress(0), &None).unwrap(); // Now assigning some memory that is equal to the start of the 32bit memory hole. let mem_size = mib_to_bytes(3328); let gm = arch_mem(mem_size); - let resource_allocator = ResourceAllocator::new().unwrap(); - mptable::setup_mptable(&gm, &resource_allocator, no_vcpus).unwrap(); + let mut resource_allocator = ResourceAllocator::new(); + mptable::setup_mptable(&gm, &mut resource_allocator, no_vcpus).unwrap(); configure_64bit_boot(&gm, GuestAddress(0), 0, &None).unwrap(); configure_pvh(&gm, GuestAddress(0), &None).unwrap(); // Now assigning some memory that falls after the 32bit memory hole. 
let mem_size = mib_to_bytes(3330); let gm = arch_mem(mem_size); - let resource_allocator = ResourceAllocator::new().unwrap(); - mptable::setup_mptable(&gm, &resource_allocator, no_vcpus).unwrap(); + let mut resource_allocator = ResourceAllocator::new(); + mptable::setup_mptable(&gm, &mut resource_allocator, no_vcpus).unwrap(); configure_64bit_boot(&gm, GuestAddress(0), 0, &None).unwrap(); configure_pvh(&gm, GuestAddress(0), &None).unwrap(); } diff --git a/src/vmm/src/arch/x86_64/mptable.rs b/src/vmm/src/arch/x86_64/mptable.rs index 17b2900aeb2..a4b1e2fa632 100644 --- a/src/vmm/src/arch/x86_64/mptable.rs +++ b/src/vmm/src/arch/x86_64/mptable.rs @@ -116,7 +116,7 @@ fn compute_mp_size(num_cpus: u8) -> usize { /// Performs setup of the MP table for the given `num_cpus`. pub fn setup_mptable( mem: &GuestMemoryMmap, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, num_cpus: u8, ) -> Result<(), MptableError> { if num_cpus > MAX_SUPPORTED_CPUS { @@ -334,27 +334,27 @@ mod tests { fn bounds_check() { let num_cpus = 4; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); - setup_mptable(&mem, &resource_allocator, num_cpus).unwrap(); + setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); } #[test] fn bounds_check_fails() { let num_cpus = 4; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus) - 1); - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); - setup_mptable(&mem, &resource_allocator, num_cpus).unwrap_err(); + setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap_err(); } #[test] fn mpf_intel_checksum() { let num_cpus = 1; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut 
resource_allocator = ResourceAllocator::new(); - setup_mptable(&mem, &resource_allocator, num_cpus).unwrap(); + setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); let mpf_intel: mpspec::mpf_intel = mem.read_obj(GuestAddress(SYSTEM_MEM_START)).unwrap(); @@ -365,9 +365,9 @@ mod tests { fn mpc_table_checksum() { let num_cpus = 4; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); - setup_mptable(&mem, &resource_allocator, num_cpus).unwrap(); + setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); let mpf_intel: mpspec::mpf_intel = mem.read_obj(GuestAddress(SYSTEM_MEM_START)).unwrap(); let mpc_offset = GuestAddress(u64::from(mpf_intel.physptr)); @@ -388,9 +388,9 @@ mod tests { fn mpc_entry_count() { let num_cpus = 1; let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(num_cpus)); - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); - setup_mptable(&mem, &resource_allocator, num_cpus).unwrap(); + setup_mptable(&mem, &mut resource_allocator, num_cpus).unwrap(); let mpf_intel: mpspec::mpf_intel = mem.read_obj(GuestAddress(SYSTEM_MEM_START)).unwrap(); let mpc_offset = GuestAddress(u64::from(mpf_intel.physptr)); @@ -419,8 +419,9 @@ mod tests { let mem = single_region_mem_at(SYSTEM_MEM_START, compute_mp_size(MAX_SUPPORTED_CPUS)); for i in 0..MAX_SUPPORTED_CPUS { - let resource_allocator = ResourceAllocator::new().unwrap(); - setup_mptable(&mem, &resource_allocator, i).unwrap(); + let mut resource_allocator = ResourceAllocator::new(); + + setup_mptable(&mem, &mut resource_allocator, i).unwrap(); let mpf_intel: mpspec::mpf_intel = mem.read_obj(GuestAddress(SYSTEM_MEM_START)).unwrap(); @@ -450,9 +451,9 @@ mod tests { fn cpu_entry_count_max() { let cpus = MAX_SUPPORTED_CPUS + 1; let mem = single_region_mem_at(SYSTEM_MEM_START, 
compute_mp_size(cpus)); - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); - let result = setup_mptable(&mem, &resource_allocator, cpus).unwrap_err(); + let result = setup_mptable(&mem, &mut resource_allocator, cpus).unwrap_err(); assert_eq!(result, MptableError::TooManyCpus); } } diff --git a/src/vmm/src/arch/x86_64/vm.rs b/src/vmm/src/arch/x86_64/vm.rs index fbc27c82a60..e194296928d 100644 --- a/src/vmm/src/arch/x86_64/vm.rs +++ b/src/vmm/src/arch/x86_64/vm.rs @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use std::fmt; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; use kvm_bindings::{ KVM_CLOCK_TSC_STABLE, KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, @@ -15,7 +15,7 @@ use crate::arch::x86_64::msr::MsrError; use crate::snapshot::Persist; use crate::utils::u64_to_usize; use crate::vstate::memory::{GuestMemoryExtension, GuestMemoryState}; -use crate::vstate::resources::ResourceAllocatorState; +use crate::vstate::resources::ResourceAllocator; use crate::vstate::vm::{VmCommon, VmError}; /// Error type for [`Vm::restore_state`] @@ -142,6 +142,7 @@ impl ArchVm { self.fd() .set_irqchip(&state.ioapic) .map_err(ArchVmError::SetIrqChipIoAPIC)?; + self.common.resource_allocator = Mutex::new(state.resource_allocator.clone()); Ok(()) } @@ -195,7 +196,7 @@ impl ArchVm { Ok(VmState { memory: self.common.guest_memory.describe(), - resource_allocator: self.common.resource_allocator.save(), + resource_allocator: self.resource_allocator().save(), pitstate, clock, pic_master, @@ -221,7 +222,7 @@ pub struct VmState { /// guest memory state pub memory: GuestMemoryState, /// resource allocator - pub resource_allocator: ResourceAllocatorState, + pub resource_allocator: ResourceAllocator, pitstate: kvm_pit_state2, clock: kvm_clock_data, // TODO: rename this field to adopt inclusive language once Linux updates it, too. 
diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index e196ef505c2..b9e5471402c 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -254,7 +254,7 @@ pub fn build_microvm_for_boot( #[cfg(target_arch = "aarch64")] if vcpus[0].kvm_vcpu.supports_pvtime() { - setup_pvtime(&vm.common.resource_allocator, &mut vcpus)?; + setup_pvtime(&mut vm.resource_allocator(), &mut vcpus)?; } else { log::warn!("Vcpus do not support pvtime, steal time will not be reported to guest"); } @@ -515,7 +515,7 @@ const STEALTIME_STRUCT_MEM_SIZE: u64 = 64; /// Helper method to allocate steal time region #[cfg(target_arch = "aarch64")] fn allocate_pvtime_region( - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, vcpu_count: usize, policy: vm_allocator::AllocPolicy, ) -> Result { @@ -529,7 +529,7 @@ fn allocate_pvtime_region( /// Sets up pvtime for all vcpus #[cfg(target_arch = "aarch64")] fn setup_pvtime( - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, vcpus: &mut [Vcpu], ) -> Result<(), StartMicrovmError> { // Alloc sys mem for steal time region diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 13ab13f47ea..fe32376ebb4 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -154,7 +154,7 @@ impl MMIODeviceManager { /// Allocates resources for a new device to be added. fn allocate_mmio_resources( &mut self, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, irq_count: u32, ) -> Result { let irq = match resource_allocator.allocate_gsi(irq_count)?[..] 
{ @@ -243,7 +243,7 @@ impl MMIODeviceManager { _cmdline: &mut kernel_cmdline::Cmdline, ) -> Result<(), MmioError> { let device = MMIODevice { - resources: self.allocate_mmio_resources(&vm.common.resource_allocator, 1)?, + resources: self.allocate_mmio_resources(&mut vm.resource_allocator(), 1)?, inner: Arc::new(Mutex::new(mmio_device)), }; @@ -277,7 +277,7 @@ impl MMIODeviceManager { let device_info = if let Some(device_info) = device_info_opt { device_info } else { - let gsi = vm.common.resource_allocator.allocate_gsi(1)?; + let gsi = vm.resource_allocator().allocate_gsi(1)?; MMIODeviceInfo { addr: SERIAL_MEM_START, len: MMIO_LEN, @@ -336,7 +336,7 @@ impl MMIODeviceManager { let device_info = if let Some(device_info) = device_info_opt { device_info } else { - let gsi = vm.common.resource_allocator.allocate_gsi(1)?; + let gsi = vm.resource_allocator().allocate_gsi(1)?; MMIODeviceInfo { addr: RTC_MEM_START, len: MMIO_LEN, @@ -754,10 +754,10 @@ pub(crate) mod tests { #[test] fn test_no_irq_allocation() { let mut device_manager = MMIODeviceManager::new(); - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); let device_info = device_manager - .allocate_mmio_resources(&resource_allocator, 0) + .allocate_mmio_resources(&mut resource_allocator, 0) .unwrap(); assert!(device_info.irq.is_none()); } @@ -765,10 +765,10 @@ pub(crate) mod tests { #[test] fn test_irq_allocation() { let mut device_manager = MMIODeviceManager::new(); - let resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); let device_info = device_manager - .allocate_mmio_resources(&resource_allocator, 1) + .allocate_mmio_resources(&mut resource_allocator, 1) .unwrap(); assert_eq!(device_info.irq.unwrap().get(), crate::arch::IRQ_BASE); } @@ -776,12 +776,12 @@ pub(crate) mod tests { #[test] fn test_allocation_failure() { let mut device_manager = MMIODeviceManager::new(); - let 
resource_allocator = ResourceAllocator::new().unwrap(); + let mut resource_allocator = ResourceAllocator::new(); assert_eq!( format!( "{}", device_manager - .allocate_mmio_resources(&resource_allocator, 2) + .allocate_mmio_resources(&mut resource_allocator, 2) .unwrap_err() ), "Invalid MMIO IRQ configuration.".to_string() diff --git a/src/vmm/src/device_manager/mod.rs b/src/vmm/src/device_manager/mod.rs index 34d1ba73091..cfc7fe44d79 100644 --- a/src/vmm/src/device_manager/mod.rs +++ b/src/vmm/src/device_manager/mod.rs @@ -221,7 +221,7 @@ impl DeviceManager { mem: &GuestMemoryMmap, vm: &Vm, ) -> Result<(), AttachDeviceError> { - let vmgenid = VmGenId::new(mem, &vm.common.resource_allocator)?; + let vmgenid = VmGenId::new(mem, &mut vm.resource_allocator())?; self.acpi_devices.attach_vmgenid(vmgenid, vm)?; Ok(()) } diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index 5c09085e84d..1fc5abf52ef 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -3,6 +3,7 @@ use std::collections::HashMap; use std::fmt::Debug; +use std::ops::DerefMut; use std::sync::{Arc, Mutex}; use event_manager::{MutEventSubscriber, SubscriberOps}; @@ -127,7 +128,7 @@ impl PciDevices { let pci_device_bdf = pci_segment.next_device_bdf()?; debug!("Allocating BDF: {pci_device_bdf:?} for device"); let mem = vm.guest_memory().clone(); - let resource_allocator = &vm.common.resource_allocator; + let device_type: u32 = device.lock().expect("Poisoned lock").device_type(); // Allocate one MSI vector per queue, plus one for configuration @@ -141,16 +142,14 @@ impl PciDevices { VirtioPciDevice::new(id.clone(), mem, device, msix_vectors, pci_device_bdf.into())?; // Allocate bars - let mut mmio32_allocator = resource_allocator - .mmio32_memory - .lock() - .expect("Poisoned lock"); - let mut mmio64_allocator = resource_allocator - .mmio64_memory - .lock() - .expect("Poisoned lock"); - - virtio_device.allocate_bars(&mut 
mmio32_allocator, &mut mmio64_allocator, None)?; + let mut resource_allocator_lock = vm.resource_allocator(); + let resource_allocator = resource_allocator_lock.deref_mut(); + + virtio_device.allocate_bars( + &mut resource_allocator.mmio32_memory, + &mut resource_allocator.mmio64_memory, + None, + )?; let virtio_device = Arc::new(Mutex::new(virtio_device)); pci_segment diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 87358181df9..f75a14d4a29 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -220,7 +220,7 @@ impl<'a> Persist<'a> for ACPIDeviceManager { let vmgenid = VmGenId::restore( VMGenIdConstructorArgs { mem: constructor_args.mem, - resource_allocator: &constructor_args.vm.common.resource_allocator, + resource_allocator: &mut constructor_args.vm.resource_allocator(), }, vmgenid_args, )?; diff --git a/src/vmm/src/devices/acpi/vmgenid.rs b/src/vmm/src/devices/acpi/vmgenid.rs index 5c8d4ecbc51..6d096007193 100644 --- a/src/vmm/src/devices/acpi/vmgenid.rs +++ b/src/vmm/src/devices/acpi/vmgenid.rs @@ -86,7 +86,7 @@ impl VmGenId { /// Allocate memory and a GSI for sending notifications and build the device pub fn new( mem: &GuestMemoryMmap, - resource_allocator: &ResourceAllocator, + resource_allocator: &mut ResourceAllocator, ) -> Result { let gsi = resource_allocator.allocate_gsi(1)?; // The generation ID needs to live in an 8-byte aligned buffer @@ -133,7 +133,7 @@ pub struct VMGenIDState { #[derive(Debug)] pub struct VMGenIdConstructorArgs<'a> { pub mem: &'a GuestMemoryMmap, - pub resource_allocator: &'a ResourceAllocator, + pub resource_allocator: &'a mut ResourceAllocator, } impl<'a> Persist<'a> for VmGenId { diff --git a/src/vmm/src/devices/pci/pci_segment.rs b/src/vmm/src/devices/pci/pci_segment.rs index c37763eab3a..7deaa027f7b 100644 --- a/src/vmm/src/devices/pci/pci_segment.rs +++ b/src/vmm/src/devices/pci/pci_segment.rs @@ -80,14 +80,13 @@ impl PciSegment { 
PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, )?; - let mem32_allocator = vm.common.resource_allocator.mmio32_memory.clone(); - let mem64_allocator = vm.common.resource_allocator.mmio64_memory.clone(); + let resource_allocator = vm.resource_allocator(); - let start_of_mem32_area = mem32_allocator.lock().unwrap().base(); - let end_of_mem32_area = mem32_allocator.lock().unwrap().end(); + let start_of_mem32_area = resource_allocator.mmio32_memory.base(); + let end_of_mem32_area = resource_allocator.mmio32_memory.end(); - let start_of_mem64_area = mem64_allocator.lock().unwrap().base(); - let end_of_mem64_area = mem64_allocator.lock().unwrap().end(); + let start_of_mem64_area = resource_allocator.mmio64_memory.base(); + let end_of_mem64_area = resource_allocator.mmio64_memory.end(); let segment = PciSegment { id, diff --git a/src/vmm/src/vstate/resources.rs b/src/vmm/src/vstate/resources.rs index 3d8d8016e97..8b0cb4a67c4 100644 --- a/src/vmm/src/vstate/resources.rs +++ b/src/vmm/src/vstate/resources.rs @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 use std::convert::Infallible; -use std::sync::{Arc, Mutex}; use serde::{Deserialize, Serialize}; pub use vm_allocator::AllocPolicy; @@ -18,36 +17,44 @@ use crate::snapshot::Persist; /// * GSIs for legacy x86_64 devices /// * GSIs for MMIO devicecs /// * Memory allocations in the MMIO address space -#[derive(Debug)] +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct ResourceAllocator { /// Allocator for device interrupt lines - pub gsi_allocator: Arc>, + pub gsi_allocator: IdAllocator, /// Allocator for memory in the 32-bit MMIO address space - pub mmio32_memory: Arc>, + pub mmio32_memory: AddressAllocator, /// Allocator for memory in the 64-bit MMIO address space - pub mmio64_memory: Arc>, + pub mmio64_memory: AddressAllocator, /// Memory allocator for system data - pub system_memory: Arc>, + pub system_memory: AddressAllocator, +} + +impl Default for ResourceAllocator { + fn default() -> Self { + 
ResourceAllocator::new() + } } impl ResourceAllocator { /// Create a new resource allocator for Firecracker devices - pub fn new() -> Result { - Ok(Self { - gsi_allocator: Arc::new(Mutex::new(IdAllocator::new(arch::IRQ_BASE, arch::IRQ_MAX)?)), - mmio32_memory: Arc::new(Mutex::new(AddressAllocator::new( + pub fn new() -> Self { + // It is fine for us to unwrap the following since we know we are passing valid ranges for + // all allocators + Self { + gsi_allocator: IdAllocator::new(arch::IRQ_BASE, arch::IRQ_MAX).unwrap(), + mmio32_memory: AddressAllocator::new( arch::MEM_32BIT_DEVICES_START, arch::MEM_32BIT_DEVICES_SIZE, - )?)), - mmio64_memory: Arc::new(Mutex::new(AddressAllocator::new( + ) + .unwrap(), + mmio64_memory: AddressAllocator::new( arch::MEM_64BIT_DEVICES_START, arch::MEM_64BIT_DEVICES_SIZE, - )?)), - system_memory: Arc::new(Mutex::new(AddressAllocator::new( - arch::SYSTEM_MEM_START, - arch::SYSTEM_MEM_SIZE, - )?)), - }) + ) + .unwrap(), + system_memory: AddressAllocator::new(arch::SYSTEM_MEM_START, arch::SYSTEM_MEM_SIZE) + .unwrap(), + } } /// Allocate a number of GSIs @@ -55,17 +62,16 @@ impl ResourceAllocator { /// # Arguments /// /// * `gsi_count` - The number of GSIs to allocate - pub fn allocate_gsi(&self, gsi_count: u32) -> Result, vm_allocator::Error> { - let mut gsi_allocator = self.gsi_allocator.lock().expect("Poisoned lock"); + pub fn allocate_gsi(&mut self, gsi_count: u32) -> Result, vm_allocator::Error> { let mut gsis = Vec::with_capacity(gsi_count as usize); for _ in 0..gsi_count { - match gsi_allocator.allocate_id() { + match self.gsi_allocator.allocate_id() { Ok(gsi) => gsis.push(gsi), Err(err) => { // It is ok to unwrap here, we just allocated the GSI gsis.into_iter().for_each(|gsi| { - gsi_allocator.free_id(gsi).unwrap(); + self.gsi_allocator.free_id(gsi).unwrap(); }); return Err(err); } @@ -85,15 +91,13 @@ impl ResourceAllocator { /// * `alignment` - The alignment of the address of the first byte /// * `policy` - A 
[`vm_allocator::AllocPolicy`] variant for determining the allocation policy pub fn allocate_32bit_mmio_memory( - &self, + &mut self, size: u64, alignment: u64, policy: AllocPolicy, ) -> Result { Ok(self .mmio32_memory - .lock() - .expect("Poisoned lock") .allocate(size, alignment, policy)? .start()) } @@ -108,15 +112,13 @@ impl ResourceAllocator { /// * `alignment` - The alignment of the address of the first byte /// * `policy` - A [`vm_allocator::AllocPolicy`] variant for determining the allocation policy pub fn allocate_64bit_mmio_memory( - &self, + &mut self, size: u64, alignment: u64, policy: AllocPolicy, ) -> Result { Ok(self .mmio64_memory - .lock() - .expect("Poisoned lock") .allocate(size, alignment, policy)? .start()) } @@ -131,78 +133,32 @@ impl ResourceAllocator { /// * `alignment` - The alignment of the address of the first byte /// * `policy` - A [`vm_allocator::AllocPolicy`] variant for determining the allocation policy pub fn allocate_system_memory( - &self, + &mut self, size: u64, alignment: u64, policy: AllocPolicy, ) -> Result { Ok(self .system_memory - .lock() - .expect("Poisoned lock") .allocate(size, alignment, policy)? 
.start()) } } impl<'a> Persist<'a> for ResourceAllocator { - type State = ResourceAllocatorState; + type State = ResourceAllocator; type ConstructorArgs = (); type Error = Infallible; fn save(&self) -> Self::State { - ResourceAllocatorState { - gsi_allocator: self.gsi_allocator.clone(), - mmio32_memory: self.mmio32_memory.clone(), - mmio64_memory: self.mmio64_memory.clone(), - system_memory: self.system_memory.clone(), - } + self.clone() } fn restore( _constructor_args: Self::ConstructorArgs, state: &Self::State, ) -> std::result::Result { - Ok(ResourceAllocator { - gsi_allocator: state.gsi_allocator.clone(), - mmio32_memory: state.mmio32_memory.clone(), - mmio64_memory: state.mmio64_memory.clone(), - system_memory: state.system_memory.clone(), - }) - } -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -/// State of a ResourceAllocator -pub struct ResourceAllocatorState { - /// Allocator for device interrupt lines - pub gsi_allocator: Arc>, - /// Allocator for memory in the 32-bit MMIO address space - pub mmio32_memory: Arc>, - /// Allocator for memory in the 64-bit MMIO address space - pub mmio64_memory: Arc>, - /// Memory allocator for system data - pub system_memory: Arc>, -} - -impl Default for ResourceAllocatorState { - fn default() -> Self { - Self { - gsi_allocator: Arc::new(Mutex::new( - IdAllocator::new(arch::IRQ_BASE, arch::IRQ_MAX).unwrap(), - )), - mmio32_memory: Arc::new(Mutex::new( - AddressAllocator::new(arch::MEM_32BIT_DEVICES_START, arch::MEM_32BIT_DEVICES_SIZE) - .unwrap(), - )), - mmio64_memory: Arc::new(Mutex::new( - AddressAllocator::new(arch::MEM_64BIT_DEVICES_START, arch::MEM_64BIT_DEVICES_SIZE) - .unwrap(), - )), - system_memory: Arc::new(Mutex::new( - AddressAllocator::new(arch::SYSTEM_MEM_START, arch::SYSTEM_MEM_SIZE).unwrap(), - )), - } + Ok(state.clone()) } } @@ -210,7 +166,7 @@ impl Default for ResourceAllocatorState { mod tests { use vm_allocator::AllocPolicy; - use super::{ResourceAllocator, ResourceAllocatorState}; + use 
super::ResourceAllocator; use crate::arch::{self, IRQ_BASE}; use crate::snapshot::{Persist, Snapshot}; @@ -218,7 +174,7 @@ mod tests { #[test] fn test_allocate_gsi() { - let allocator = ResourceAllocator::new().unwrap(); + let mut allocator = ResourceAllocator::new(); // asking for 0 IRQs should return us an empty vector assert_eq!(allocator.allocate_gsi(0), Ok(vec![])); // We cannot allocate more GSIs than available @@ -239,7 +195,7 @@ mod tests { // But we should be able to ask for 0 GSIs assert_eq!(allocator.allocate_gsi(0), Ok(vec![])); - let allocator = ResourceAllocator::new().unwrap(); + let mut allocator = ResourceAllocator::new(); // We should be able to allocate 1 GSI assert_eq!(allocator.allocate_gsi(1), Ok(vec![arch::IRQ_BASE])); // We can't allocate MAX_IRQS any more @@ -258,18 +214,17 @@ mod tests { fn clone_allocator(allocator: &ResourceAllocator) -> ResourceAllocator { let mut buf = vec![0u8; 1024]; Snapshot::serialize(&mut buf.as_mut_slice(), &allocator.save()).unwrap(); - let restored_state: ResourceAllocatorState = - Snapshot::deserialize(&mut buf.as_slice()).unwrap(); + let restored_state: ResourceAllocator = Snapshot::deserialize(&mut buf.as_slice()).unwrap(); ResourceAllocator::restore((), &restored_state).unwrap() } #[test] fn test_save_restore() { - let allocator0 = ResourceAllocator::new().unwrap(); + let mut allocator0 = ResourceAllocator::new(); let gsi_0 = allocator0.allocate_gsi(1).unwrap()[0]; assert_eq!(gsi_0, IRQ_BASE); - let allocator1 = clone_allocator(&allocator0); + let mut allocator1 = clone_allocator(&allocator0); let gsi_1 = allocator1.allocate_gsi(1).unwrap()[0]; assert_eq!(gsi_1, IRQ_BASE + 1); let mmio32_mem = allocator1 @@ -285,7 +240,7 @@ mod tests { .unwrap(); assert_eq!(system_mem, arch::SYSTEM_MEM_START); - let allocator2 = clone_allocator(&allocator1); + let mut allocator2 = clone_allocator(&allocator1); allocator2 .allocate_32bit_mmio_memory(0x42, 1, AllocPolicy::ExactMatch(mmio32_mem)) .unwrap_err(); diff --git 
a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 6bdfad5e37b..787cfa12cce 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -10,7 +10,7 @@ use std::fs::OpenOptions; use std::io::Write; use std::path::Path; use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::{Arc, Mutex}; +use std::sync::{Arc, Mutex, MutexGuard}; #[cfg(target_arch = "x86_64")] use kvm_bindings::KVM_IRQCHIP_IOAPIC; @@ -246,7 +246,7 @@ pub struct VmCommon { /// Interrupts used by Vm's devices pub interrupts: Mutex>, /// Allocator for VM resources - pub resource_allocator: Arc, + pub resource_allocator: Mutex, /// MMIO bus pub mmio_bus: Arc, } @@ -319,7 +319,7 @@ impl Vm { max_memslots: kvm.max_nr_memslots(), guest_memory: GuestMemoryMmap::default(), interrupts: Mutex::new(HashMap::new()), - resource_allocator: Arc::new(ResourceAllocator::new()?), + resource_allocator: Mutex::new(ResourceAllocator::new()), mmio_bus: Arc::new(vm_device::Bus::new()), }) } @@ -405,6 +405,14 @@ impl Vm { &self.common.guest_memory } + /// Gets a mutable reference to this [`Vm`]'s [`ResourceAllocator`] object + pub fn resource_allocator(&self) -> MutexGuard { + self.common + .resource_allocator + .lock() + .expect("Poisoned lock") + } + /// Resets the KVM dirty bitmap for each of the guest's memory regions. pub fn reset_dirty_bitmap(&self) { self.guest_memory() @@ -578,8 +586,7 @@ impl Vm { debug!("Creating new MSI group with {count} vectors"); let mut irq_routes = HashMap::with_capacity(count as usize); for (gsi, i) in vm - .common - .resource_allocator + .resource_allocator() .allocate_gsi(count as u32)? .iter() .zip(0u32..) 
@@ -628,6 +635,8 @@ pub(crate) mod tests { use vm_memory::mmap::MmapRegionBuilder; use super::*; + #[cfg(target_arch = "x86_64")] + use crate::snapshot::Snapshot; use crate::test_utils::single_region_mem_raw; use crate::utils::mib_to_bytes; use crate::vstate::kvm::Kvm; @@ -966,4 +975,43 @@ pub(crate) mod tests { assert!(!new_vector.enabled.load(Ordering::Acquire)); } } + + #[cfg(target_arch = "x86_64")] + #[test] + fn test_restore_state_resource_allocator() { + use vm_allocator::AllocPolicy; + + let mut snapshot_data = vec![0u8; 10000]; + let (_, mut vm) = setup_vm_with_memory(0x1000); + vm.setup_irqchip().unwrap(); + + // Allocate a GSI and some memory and make sure they are still allocated after restore + let (gsi, range) = { + let mut resource_allocator = vm.resource_allocator(); + + let gsi = resource_allocator.allocate_gsi(1).unwrap()[0]; + let range = resource_allocator + .allocate_32bit_mmio_memory(1024, 1024, AllocPolicy::FirstMatch) + .unwrap(); + (gsi, range) + }; + + let state = vm.save_state().unwrap(); + Snapshot::serialize(&mut snapshot_data.as_mut_slice(), &state).unwrap(); + + let restored_state: VmState = Snapshot::deserialize(&mut snapshot_data.as_slice()).unwrap(); + vm.restore_state(&restored_state).unwrap(); + + let mut resource_allocator = vm.resource_allocator(); + let gsi_new = resource_allocator.allocate_gsi(1).unwrap()[0]; + assert_eq!(gsi + 1, gsi_new); + + resource_allocator + .allocate_32bit_mmio_memory(1024, 1024, AllocPolicy::ExactMatch(range)) + .unwrap_err(); + let range_new = resource_allocator + .allocate_32bit_mmio_memory(1024, 1024, AllocPolicy::FirstMatch) + .unwrap(); + assert_eq!(range + 1024, range_new); + } } From 3181444306cd67b050befde1dee26535df0b525e Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Wed, 2 Jul 2025 16:17:48 +0200 Subject: [PATCH 56/56] fix(vsock): pass correct index when triggering interrupts We were confusing queue indexes with event indexes, when passing the index of the queue that needed to be
triggered after handling events. Fix the logic to pass the correct index. This refactors the code a bit to signal the queues in each event handler method. With MMIO we don't need to signal each queue independently (one signal will cause the guest to scan all queues). With PCI though, we are using MSI-X, so we need to signal each queue independently. Also, change vsock functional integration tests to also run for PCI-enabled microVMs. Signed-off-by: Babis Chalios --- src/vmm/src/devices/virtio/vsock/device.rs | 2 +- .../src/devices/virtio/vsock/event_handler.rs | 78 ++++++++++--------- .../functional/test_vsock.py | 20 +++-- 3 files changed, 53 insertions(+), 47 deletions(-) diff --git a/src/vmm/src/devices/virtio/vsock/device.rs b/src/vmm/src/devices/virtio/vsock/device.rs index bef7fd0af4c..56426d1ea0f 100644 --- a/src/vmm/src/devices/virtio/vsock/device.rs +++ b/src/vmm/src/devices/virtio/vsock/device.rs @@ -378,7 +378,7 @@ where // `TRANSPORT_RESET_EVENT` event we sent during snapshot creation.
if self.is_activated() { info!("kick vsock {}.", self.id()); - self.signal_used_queue(0).unwrap(); + self.signal_used_queue(EVQ_INDEX).unwrap(); } } } diff --git a/src/vmm/src/devices/virtio/vsock/event_handler.rs b/src/vmm/src/devices/virtio/vsock/event_handler.rs index b4445e298ae..e9e325c47e4 100755 --- a/src/vmm/src/devices/virtio/vsock/event_handler.rs +++ b/src/vmm/src/devices/virtio/vsock/event_handler.rs @@ -47,81 +47,82 @@ where const PROCESS_EVQ: u32 = 3; const PROCESS_NOTIFY_BACKEND: u32 = 4; - pub fn handle_rxq_event(&mut self, evset: EventSet) -> bool { + pub fn handle_rxq_event(&mut self, evset: EventSet) { if evset != EventSet::IN { warn!("vsock: rxq unexpected event {:?}", evset); METRICS.rx_queue_event_fails.inc(); - return false; + return; } - let mut raise_irq = false; if let Err(err) = self.queue_events[RXQ_INDEX].read() { error!("Failed to get vsock rx queue event: {:?}", err); METRICS.rx_queue_event_fails.inc(); } else if self.backend.has_pending_rx() { - // OK to unwrap: Only QueueError::InvalidAvailIdx is returned, and we explicitly - // want to panic on that one. - raise_irq |= self.process_rx().unwrap(); + if self.process_rx().unwrap() { + self.signal_used_queue(RXQ_INDEX) + .expect("vsock: Could not trigger device interrupt or RX queue"); + } METRICS.rx_queue_event_count.inc(); } - raise_irq } - pub fn handle_txq_event(&mut self, evset: EventSet) -> bool { + pub fn handle_txq_event(&mut self, evset: EventSet) { if evset != EventSet::IN { warn!("vsock: txq unexpected event {:?}", evset); METRICS.tx_queue_event_fails.inc(); - return false; + return; } - let mut raise_irq = false; if let Err(err) = self.queue_events[TXQ_INDEX].read() { error!("Failed to get vsock tx queue event: {:?}", err); METRICS.tx_queue_event_fails.inc(); } else { - // OK to unwrap: Only QueueError::InvalidAvailIdx is returned, and we explicitly - // want to panic on that one. 
- raise_irq |= self.process_tx().unwrap(); + if self.process_tx().unwrap() { + self.signal_used_queue(TXQ_INDEX) + .expect("vsock: Could not trigger device interrupt or TX queue"); + } METRICS.tx_queue_event_count.inc(); // The backend may have queued up responses to the packets we sent during // TX queue processing. If that happened, we need to fetch those responses // and place them into RX buffers. - if self.backend.has_pending_rx() { - raise_irq |= self.process_rx().unwrap(); + if self.backend.has_pending_rx() && self.process_rx().unwrap() { + self.signal_used_queue(RXQ_INDEX) + .expect("vsock: Could not trigger device interrupt or RX queue"); } } - raise_irq } - pub fn handle_evq_event(&mut self, evset: EventSet) -> bool { + pub fn handle_evq_event(&mut self, evset: EventSet) { if evset != EventSet::IN { warn!("vsock: evq unexpected event {:?}", evset); METRICS.ev_queue_event_fails.inc(); - return false; + return; } if let Err(err) = self.queue_events[EVQ_INDEX].read() { error!("Failed to consume vsock evq event: {:?}", err); METRICS.ev_queue_event_fails.inc(); } - false } /// Notify backend of new events. - pub fn notify_backend(&mut self, evset: EventSet) -> Result { + pub fn notify_backend(&mut self, evset: EventSet) -> Result<(), InvalidAvailIdx> { self.backend.notify(evset); // After the backend has been kicked, it might've freed up some resources, so we // can attempt to send it more data to process. // In particular, if `self.backend.send_pkt()` halted the TX queue processing (by // returning an error) at some point in the past, now is the time to try walking the // TX queue again. - // OK to unwrap: Only QueueError::InvalidAvailIdx is returned, and we explicitly - // want to panic on that one. - let mut raise_irq = self.process_tx()?; - if self.backend.has_pending_rx() { - raise_irq |= self.process_rx()?; + if self.process_tx()? 
{ + self.signal_used_queue(TXQ_INDEX) + .expect("vsock: Could not trigger device interrupt or TX queue"); + } + if self.backend.has_pending_rx() && self.process_rx()? { + self.signal_used_queue(RXQ_INDEX) + .expect("vsock: Could not trigger device interrupt or RX queue"); } - Ok(raise_irq) + + Ok(()) } fn register_runtime_events(&self, ops: &mut EventOps) { @@ -189,19 +190,14 @@ where let evset = event.event_set(); if self.is_activated() { - let mut raise_irq = false; match source { Self::PROCESS_ACTIVATE => self.handle_activate_event(ops), - Self::PROCESS_RXQ => raise_irq = self.handle_rxq_event(evset), - Self::PROCESS_TXQ => raise_irq = self.handle_txq_event(evset), - Self::PROCESS_EVQ => raise_irq = self.handle_evq_event(evset), - Self::PROCESS_NOTIFY_BACKEND => raise_irq = self.notify_backend(evset).unwrap(), + Self::PROCESS_RXQ => self.handle_rxq_event(evset), + Self::PROCESS_TXQ => self.handle_txq_event(evset), + Self::PROCESS_EVQ => self.handle_evq_event(evset), + Self::PROCESS_NOTIFY_BACKEND => self.notify_backend(evset).unwrap(), _ => warn!("Unexpected vsock event received: {:?}", source), }; - if raise_irq { - self.signal_used_queue(source as usize) - .expect("vsock: Could not trigger device interrupt"); - } } else { warn!( "Vsock: The device is not yet activated. 
Spurious event received: {:?}", @@ -309,7 +305,9 @@ mod tests { let mut ctx = test_ctx.create_event_handler_context(); ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); - assert!(!ctx.device.handle_txq_event(EventSet::IN)); + let metric_before = METRICS.tx_queue_event_fails.count(); + ctx.device.handle_txq_event(EventSet::IN); + assert_eq!(metric_before + 1, METRICS.tx_queue_event_fails.count()); } } @@ -370,7 +368,9 @@ mod tests { let mut ctx = test_ctx.create_event_handler_context(); ctx.mock_activate(test_ctx.mem.clone(), test_ctx.interrupt.clone()); ctx.device.backend.set_pending_rx(false); - assert!(!ctx.device.handle_rxq_event(EventSet::IN)); + let metric_before = METRICS.rx_queue_event_fails.count(); + ctx.device.handle_rxq_event(EventSet::IN); + assert_eq!(metric_before + 1, METRICS.rx_queue_event_fails.count()); } } @@ -381,7 +381,9 @@ mod tests { let test_ctx = TestContext::new(); let mut ctx = test_ctx.create_event_handler_context(); ctx.device.backend.set_pending_rx(false); - assert!(!ctx.device.handle_evq_event(EventSet::IN)); + let metric_before = METRICS.ev_queue_event_fails.count(); + ctx.device.handle_evq_event(EventSet::IN); + assert_eq!(metric_before + 1, METRICS.ev_queue_event_fails.count()); } } diff --git a/tests/integration_tests/functional/test_vsock.py b/tests/integration_tests/functional/test_vsock.py index dfa02510b37..5b6221c32a9 100644 --- a/tests/integration_tests/functional/test_vsock.py +++ b/tests/integration_tests/functional/test_vsock.py @@ -37,7 +37,7 @@ TEST_WORKER_COUNT = 10 -def test_vsock(uvm_plain_any, bin_vsock_path, test_fc_session_root_path): +def test_vsock(uvm_plain_any, pci_enabled, bin_vsock_path, test_fc_session_root_path): """ Test guest and host vsock initiated connections. 
@@ -45,7 +45,7 @@ def test_vsock(uvm_plain_any, bin_vsock_path, test_fc_session_root_path): """ vm = uvm_plain_any - vm.spawn() + vm.spawn(pci=pci_enabled) vm.basic_config() vm.add_net_iface() @@ -102,12 +102,12 @@ def negative_test_host_connections(vm, blob_path, blob_hash): validate_fc_metrics(metrics) -def test_vsock_epipe(uvm_plain, bin_vsock_path, test_fc_session_root_path): +def test_vsock_epipe(uvm_plain, pci_enabled, bin_vsock_path, test_fc_session_root_path): """ Vsock negative test to validate SIGPIPE/EPIPE handling. """ vm = uvm_plain - vm.spawn() + vm.spawn(pci=pci_enabled) vm.basic_config() vm.add_net_iface() vm.api.vsock.put(vsock_id="vsock0", guest_cid=3, uds_path=f"/{VSOCK_UDS_PATH}") @@ -129,7 +129,7 @@ def test_vsock_epipe(uvm_plain, bin_vsock_path, test_fc_session_root_path): def test_vsock_transport_reset_h2g( - uvm_nano, microvm_factory, bin_vsock_path, test_fc_session_root_path + uvm_plain, pci_enabled, microvm_factory, bin_vsock_path, test_fc_session_root_path ): """ Vsock transport reset test. @@ -146,7 +146,9 @@ def test_vsock_transport_reset_h2g( 6. Close VM -> Load VM from Snapshot -> check that vsock device is still working. """ - test_vm = uvm_nano + test_vm = uvm_plain + test_vm.spawn(pci=pci_enabled) + test_vm.basic_config(vcpu_count=2, mem_size_mib=256) test_vm.add_net_iface() test_vm.api.vsock.put(vsock_id="vsock0", guest_cid=3, uds_path=f"/{VSOCK_UDS_PATH}") test_vm.start() @@ -213,11 +215,13 @@ def test_vsock_transport_reset_h2g( validate_fc_metrics(metrics) -def test_vsock_transport_reset_g2h(uvm_nano, microvm_factory): +def test_vsock_transport_reset_g2h(uvm_plain, pci_enabled, microvm_factory): """ Vsock transport reset test. """ - test_vm = uvm_nano + test_vm = uvm_plain + test_vm.spawn(pci=pci_enabled) + test_vm.basic_config(vcpu_count=2, mem_size_mib=256) test_vm.add_net_iface() test_vm.api.vsock.put(vsock_id="vsock0", guest_cid=3, uds_path=f"/{VSOCK_UDS_PATH}") test_vm.start()