diff --git a/net_util/src/lib.rs b/net_util/src/lib.rs index 6cf5791507..db396bc2da 100644 --- a/net_util/src/lib.rs +++ b/net_util/src/lib.rs @@ -101,7 +101,7 @@ fn create_unix_socket() -> Result { Ok(unsafe { net::UdpSocket::from_raw_fd(sock) }) } -fn vnet_hdr_len() -> usize { +pub fn vnet_hdr_len() -> usize { std::mem::size_of::() } diff --git a/virtio-devices/src/device.rs b/virtio-devices/src/device.rs index f0ed28f517..d749fd00a7 100644 --- a/virtio-devices/src/device.rs +++ b/virtio-devices/src/device.rs @@ -166,6 +166,12 @@ pub trait VirtioDevice: Send { /// Set the access platform trait to let the device perform address /// translations if needed. fn set_access_platform(&mut self, _access_platform: Arc) {} + + /// Some devices can announce their location after a live migration to + /// speed up normal execution. + fn post_migration_announcer(&self) -> Option<Box<dyn PostMigrationAnnouncer>> { + None + } } /// Trait to define address translation for devices managed by virtio-iommu @@ -338,3 +344,11 @@ impl Pausable for VirtioCommon { Ok(()) } } + +/// A PostMigrationAnnouncer is used to inform other devices about the new +/// location of a VM after a live migration. +pub trait PostMigrationAnnouncer: Send { + // Sending the announces is done on a best-effort basis, so we ignore + // errors. 
+ fn announce(&mut self); +} diff --git a/virtio-devices/src/lib.rs b/virtio-devices/src/lib.rs index da4f1c91be..d2d428299d 100644 --- a/virtio-devices/src/lib.rs +++ b/virtio-devices/src/lib.rs @@ -42,8 +42,8 @@ pub use self::balloon::Balloon; pub use self::block::{Block, BlockState}; pub use self::console::{Console, ConsoleResizer, Endpoint}; pub use self::device::{ - DmaRemapping, VirtioCommon, VirtioDevice, VirtioInterrupt, VirtioInterruptType, - VirtioSharedMemoryList, + DmaRemapping, PostMigrationAnnouncer, VirtioCommon, VirtioDevice, VirtioInterrupt, + VirtioInterruptType, VirtioSharedMemoryList, }; pub use self::epoll_helper::{ EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, EpollHelperHandler, diff --git a/virtio-devices/src/net.rs b/virtio-devices/src/net.rs index 9b52786ca4..b2458aad23 100644 --- a/virtio-devices/src/net.rs +++ b/virtio-devices/src/net.rs @@ -20,8 +20,9 @@ use log::{debug, error, info, trace}; #[cfg(not(fuzzing))] use net_util::virtio_features_to_tap_offload; use net_util::{ - CtrlQueue, MacAddr, NetCounters, NetQueuePair, OpenTapError, RxVirtio, Tap, TapError, TxVirtio, - VirtioNetConfig, build_net_config_space, build_net_config_space_with_mq, open_tap, + CtrlQueue, MAC_ADDR_LEN, MacAddr, NetCounters, NetQueuePair, OpenTapError, RxVirtio, Tap, + TapError, TxVirtio, VirtioNetConfig, build_net_config_space, build_net_config_space_with_mq, + open_tap, vnet_hdr_len, }; use seccompiler::SeccompAction; use serde::{Deserialize, Serialize}; @@ -40,6 +41,7 @@ use super::{ EpollHelperHandler, Error as DeviceError, RateLimiterConfig, VirtioCommon, VirtioDevice, VirtioDeviceType, VirtioInterruptType, }; +use crate::device::PostMigrationAnnouncer; use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; use crate::{GuestMemoryMmap, VirtioInterrupt}; @@ -655,6 +657,38 @@ impl Net { pub fn wait_for_epoll_threads(&mut self) { self.common.wait_for_epoll_threads(); } + + fn build_rarp_announce(&self) -> [u8; 60] { + 
const ETH_P_RARP: u16 = 0x8035; // Ethertype RARP + const ARP_HTYPE_ETH: u16 = 0x1; // Hardware type Ethernet + const ARP_PTYPE_IP: u16 = 0x0800; // Protocol type IPv4 + const ARP_OP_REQUEST_REV: u16 = 0x0003; // RARP Request opcode + + const IPV4_ADDR_LENGTH: usize = 4; // Size of an IPv4 address + + let mut buf = [0u8; 60]; + + // Ethernet header + buf[0..6].copy_from_slice(&[0xff; MAC_ADDR_LEN]); // This is a broadcast + buf[6..12].copy_from_slice(&self.config.mac); // Src is this NIC + buf[12..14].copy_from_slice(&ETH_P_RARP.to_be_bytes()); // This is a RARP packet + + // ARP Header + buf[14..16].copy_from_slice(&ARP_HTYPE_ETH.to_be_bytes()); + buf[16..18].copy_from_slice(&ARP_PTYPE_IP.to_be_bytes()); + buf[18] = MAC_ADDR_LEN as u8; // Hardware address length (ethernet) + buf[19] = IPV4_ADDR_LENGTH as u8; // Protocol address length (IPv4) + // This is a "fake RARP" packet, we don't want to perform a real RARP lookup. + // Thus the content of the next fields is largely irrelevant. Setting source + // hardware address = target hardware address is fine according to RFC 903. 
+ buf[20..22].copy_from_slice(&ARP_OP_REQUEST_REV.to_be_bytes()); + buf[22..28].copy_from_slice(&self.config.mac); // Source hardware address + buf[28..32].copy_from_slice(&[0x00; IPV4_ADDR_LENGTH]); // Source protocol address + buf[32..38].copy_from_slice(&self.config.mac); // Target hardware address + buf[38..42].copy_from_slice(&[0x00; IPV4_ADDR_LENGTH]); // Target protocol address + + buf + } } impl Drop for Net { @@ -870,6 +904,13 @@ impl VirtioDevice for Net { fn set_access_platform(&mut self, access_platform: Arc) { self.common.set_access_platform(access_platform); } + + fn post_migration_announcer(&self) -> std::option::Option<Box<dyn PostMigrationAnnouncer>> { + Some(Box::new(TapRarpAnnouncer::new( + self.build_rarp_announce(), + self.taps.clone(), + ))) + } } impl Pausable for Net { @@ -898,3 +939,34 @@ impl Snapshottable for Net { } impl Transportable for Net {} impl Migratable for Net {} + +pub struct TapRarpAnnouncer { + announce: [u8; 60], + taps: Vec<Tap>, +} + +impl TapRarpAnnouncer { + pub fn new(announce: [u8; 60], taps: Vec<Tap>) -> Self { + Self { announce, taps } + } +} + +impl PostMigrationAnnouncer for TapRarpAnnouncer { + fn announce(&mut self) { + // We have to add a virtio-net header to the announce. + let mut buf = vec![0u8; vnet_hdr_len() + self.announce.len()]; + buf[vnet_hdr_len()..].copy_from_slice(&self.announce); + + for tap in &self.taps { + // SAFETY: `buf.as_ptr()` is valid for `buf.len()` bytes and remains + // valid until the syscall returns. `tap.as_raw_fd()` is a valid TAP fd. 
+ let _ = unsafe { + libc::write( + tap.as_raw_fd(), + buf.as_ptr() as *const libc::c_void, + buf.len(), + ) + }; + } + } +} diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 6ffeadf4cd..49f8d6c5fc 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -16,10 +16,11 @@ use std::num::Wrapping; use std::os::unix::fs::OpenOptionsExt; use std::os::unix::io::{AsRawFd, FromRawFd}; use std::path::{Path, PathBuf}; -use std::result; use std::sync::{Arc, Mutex}; +use std::time::Duration; #[cfg(not(target_arch = "riscv64"))] use std::time::Instant; +use std::{result, thread}; use acpi_tables::sdt::GenericAddress; use acpi_tables::{Aml, aml}; @@ -90,8 +91,8 @@ use vfio_ioctls::{VfioContainer, VfioDevice, VfioDeviceFd}; use virtio_devices::transport::{VirtioPciDevice, VirtioPciDeviceActivator, VirtioTransport}; use virtio_devices::vhost_user::VhostUserConfig; use virtio_devices::{ - AccessPlatformMapping, ActivateError, Block, Endpoint, IommuMapping, VdpaDmaMapping, - VirtioMemMappingSource, + AccessPlatformMapping, ActivateError, Block, Endpoint, IommuMapping, PostMigrationAnnouncer, + VdpaDmaMapping, VirtioMemMappingSource, }; use vm_allocator::{AddressAllocator, SystemAllocator}; use vm_device::dma_mapping::ExternalDmaMapping; @@ -5063,6 +5064,48 @@ impl DeviceManager { self.vfio_container = None; } } + + // Calls the PostMigrationAnnouncers of each device that has one, and schedules + // periodic announcements. Currently only network devices use this to announce + // the new location of the VM to the network. + pub fn post_migration_announce(&self) { + let mut announcers: Vec<Box<dyn PostMigrationAnnouncer>> = self + .virtio_devices + .iter() + .filter_map(|dev| dev.virtio_device.lock().unwrap().post_migration_announcer()) + .collect(); + + announcers.iter_mut().for_each(|a| a.announce()); + schedule_post_migration_announces(announcers, 4, 50, 100, 450); + } +} + +// We could make this announcer configurable. 
+fn schedule_post_migration_announces( + mut announcers: Vec<Box<dyn PostMigrationAnnouncer>>, + rounds: u32, + initial_ms: u64, + step_ms: u64, + max_ms: u64, +) { + if announcers.is_empty() || rounds == 0 { + return; + } + + let _ = thread::Builder::new() + .name("post-migration-announcers".to_string()) + .spawn(move || { + for round in 0..rounds { + // The first announce is done synchronously, thus we sleep at the + // start of the loop. + + let delay = (initial_ms + (round as u64) * step_ms).min(max_ms); + let delay = Duration::from_millis(delay); + thread::sleep(delay); + + announcers.iter_mut().for_each(|a| a.announce()); + } + }); } #[cfg(feature = "ivshmem")] @@ -5406,6 +5449,10 @@ impl Pausable for DeviceManager { } fn resume(&mut self) -> result::Result<(), MigratableError> { + // Before resuming the devices, we activate the post migration announcers + // of devices that have one. + self.post_migration_announce(); + for (_, device_node) in self.device_tree.lock().unwrap().iter() { if let Some(migratable) = &device_node.migratable { migratable.lock().unwrap().resume()?;