diff --git a/kvm-bindings/src/lib.rs b/kvm-bindings/src/lib.rs index ec56e597..f3d99d39 100644 --- a/kvm-bindings/src/lib.rs +++ b/kvm-bindings/src/lib.rs @@ -39,3 +39,7 @@ pub use self::arm64::*; mod riscv64; #[cfg(target_arch = "riscv64")] pub use self::riscv64::*; + +// linux defines these based on _BITUL macros and bindgen fails to generate them +pub const KVM_DIRTY_GFN_F_DIRTY: u32 = 0b1; +pub const KVM_DIRTY_GFN_F_RESET: u32 = 0b10; diff --git a/kvm-ioctls/CHANGELOG.md b/kvm-ioctls/CHANGELOG.md index e5df7b76..839f81ea 100644 --- a/kvm-ioctls/CHANGELOG.md +++ b/kvm-ioctls/CHANGELOG.md @@ -2,6 +2,21 @@ ## Upcoming Release +### Added + +- Added `KvmDirtyLogRing` structure to mmap the dirty log ring. +- Added `KVM_DIRTY_GFN_F_DIRTY` and `KVM_DIRTY_GFN_F_RESET` bitflags. +- Added `KvmDirtyLogRing` iterator type for accessing dirty log entries. +- Added `dirty_log_ring` field to `VcpuFd` to access per-vCPU dirty rings. +- Added `dirty_ring_bytes` field to `VmFd` to automatically map correct size dirty + rings for vCPUs as they are created. +- Added `enable_dirty_log_ring` function on `VmFd` to check corresponding + capabilities and enable KVM's dirty log ring. +- Added `VcpuFd::dirty_log_ring_iter()` to iterate over dirty guest frame numbers. +- Added `VmFd::reset_dirty_rings()` to reset all dirty rings for the VM. + +- Added `DirtyLogRing` variant to `Cap`, mapping the `KVM_CAP_DIRTY_LOG_RING` capability.
+ ## v0.24.0 ### Added diff --git a/kvm-ioctls/src/cap.rs b/kvm-ioctls/src/cap.rs index 67d4eb54..5b192cf0 100644 --- a/kvm-ioctls/src/cap.rs +++ b/kvm-ioctls/src/cap.rs @@ -169,4 +169,5 @@ pub enum Cap { NestedState = KVM_CAP_NESTED_STATE, #[cfg(target_arch = "x86_64")] X2ApicApi = KVM_CAP_X2APIC_API, + DirtyLogRing = KVM_CAP_DIRTY_LOG_RING, } diff --git a/kvm-ioctls/src/ioctls/mod.rs b/kvm-ioctls/src/ioctls/mod.rs index 22cd6067..d7bc5a1f 100644 --- a/kvm-ioctls/src/ioctls/mod.rs +++ b/kvm-ioctls/src/ioctls/mod.rs @@ -10,7 +10,8 @@ use std::os::unix::io::AsRawFd; use std::ptr::{NonNull, null_mut}; use kvm_bindings::{ - KVM_COALESCED_MMIO_PAGE_OFFSET, kvm_coalesced_mmio, kvm_coalesced_mmio_ring, kvm_run, + KVM_COALESCED_MMIO_PAGE_OFFSET, KVM_DIRTY_GFN_F_DIRTY, KVM_DIRTY_GFN_F_RESET, + KVM_DIRTY_LOG_PAGE_OFFSET, kvm_coalesced_mmio, kvm_coalesced_mmio_ring, kvm_dirty_gfn, kvm_run, }; use vmm_sys_util::errno; @@ -29,6 +30,104 @@ pub mod vm; /// is otherwise a direct mapping to Result. pub type Result<T> = std::result::Result<T, errno::Error>; +/// A wrapper around the KVM dirty log ring page. +#[derive(Debug)] +pub(crate) struct KvmDirtyLogRing { + /// Next potentially dirty guest frame number slot index + next_dirty: u64, + /// Memory-mapped array of dirty guest frame number entries + gfns: NonNull<kvm_dirty_gfn>, + /// Ring size mask (size-1) for efficient modulo operations + mask: u64, +} + +// SAFETY: `KvmDirtyLogRing` exclusively owns its mmap'd region (it is unmapped only in `Drop`), +// and the mapped entries are accessed solely through `&mut self`, so moving the struct to, or +// sharing references to it across, threads cannot create unsynchronized aliased access from +// safe code. NOTE(review): the kernel updates entries concurrently and accesses here are plain +// volatile reads/writes — confirm the ordering requirements of KVM_CAP_DIRTY_LOG_RING_ACQ_REL. +unsafe impl Send for KvmDirtyLogRing {} +unsafe impl Sync for KvmDirtyLogRing {} +impl KvmDirtyLogRing { + /// Maps the KVM dirty log ring from the vCPU file descriptor. + /// + /// # Arguments + /// * `fd` - vCPU file descriptor to mmap from. + /// * `bytes` - Size of memory region in bytes. + pub(crate) fn mmap_from_fd<F: AsRawFd>(fd: &F, bytes: usize) -> Result<Self> { + // SAFETY: We trust the sysconf libc function and we're calling it + // with a correct parameter. 
+ let page_size = match unsafe { libc::sysconf(libc::_SC_PAGESIZE) } { + -1 => return Err(errno::Error::last()), + ps => ps as usize, + }; + + let offset = page_size * KVM_DIRTY_LOG_PAGE_OFFSET as usize; + + if bytes % std::mem::size_of::() != 0 { + // Size of dirty ring in bytes must be multiples of slot size + return Err(errno::Error::new(libc::EINVAL)); + } + let slots = bytes / std::mem::size_of::(); + if !slots.is_power_of_two() { + // Number of slots must be power of two + return Err(errno::Error::new(libc::EINVAL)); + } + + // SAFETY: KVM guarantees that there is a page at offset + // KVM_DIRTY_LOG_PAGE_OFFSET * PAGE_SIZE if the appropriate + // capability is available. If it is not, the call will simply + // fail. + let gfns = unsafe { + NonNull::::new(libc::mmap( + null_mut(), + bytes, + libc::PROT_READ | libc::PROT_WRITE, + libc::MAP_SHARED, + fd.as_raw_fd(), + offset as i64, + ) as *mut kvm_dirty_gfn) + .filter(|addr| addr.as_ptr() != libc::MAP_FAILED as *mut kvm_dirty_gfn) + .ok_or_else(|| errno::Error::last())? + }; + return Ok(Self { + next_dirty: 0, + gfns, + mask: (slots - 1) as u64, + }); + } +} + +impl Drop for KvmDirtyLogRing { + fn drop(&mut self) { + // SAFETY: This is safe because we mmap the page ourselves, and nobody + // else is holding a reference to it. 
+ unsafe { + libc::munmap( + self.gfns.as_ptr().cast(), + (self.mask + 1) as usize * std::mem::size_of::<kvm_dirty_gfn>(), + ); + } + } +} + +impl Iterator for KvmDirtyLogRing { + type Item = (u32, u64); + fn next(&mut self) -> Option<Self::Item> { + let i = self.next_dirty & self.mask; + // SAFETY: `i` is masked to the ring size, so the pointer stays inside the mapping + // owned by `self`; volatile accesses are used since the kernel writes these entries. + unsafe { + let gfn_ptr = self.gfns.add(i as usize).as_ptr(); + let gfn = gfn_ptr.read_volatile(); + if gfn.flags & KVM_DIRTY_GFN_F_DIRTY == 0 { + // next_dirty stays the same, it will become the next dirty element + return None; + } else { + self.next_dirty += 1; + let mut updated_gfn = gfn; + // Mark the entry harvested so KVM_RESET_DIRTY_RINGS can reprotect it. + // NOTE(review): `|=` would be more robust than `^=` — confirm RESET can + // never already be set on an entry that still has DIRTY set. + updated_gfn.flags ^= KVM_DIRTY_GFN_F_RESET; + gfn_ptr.write_volatile(updated_gfn); + return Some((gfn.slot, gfn.offset)); + } + } + } +} + /// A wrapper around the coalesced MMIO ring page. #[derive(Debug)] pub(crate) struct KvmCoalescedIoRing { diff --git a/kvm-ioctls/src/ioctls/vcpu.rs b/kvm-ioctls/src/ioctls/vcpu.rs index a1002aa5..0428babc 100644 --- a/kvm-ioctls/src/ioctls/vcpu.rs +++ b/kvm-ioctls/src/ioctls/vcpu.rs @@ -16,7 +16,7 @@ use libc::EINVAL; use std::fs::File; use std::os::unix::io::{AsRawFd, RawFd}; -use crate::ioctls::{KvmCoalescedIoRing, KvmRunWrapper, Result}; +use crate::ioctls::{KvmCoalescedIoRing, KvmDirtyLogRing, KvmRunWrapper, Result}; use crate::kvm_ioctls::*; use vmm_sys_util::errno; use vmm_sys_util::ioctl::{ioctl, ioctl_with_mut_ref, ioctl_with_ref}; @@ -197,6 +197,9 @@ pub struct VcpuFd { kvm_run_ptr: KvmRunWrapper, /// A pointer to the coalesced MMIO page coalesced_mmio_ring: Option<KvmCoalescedIoRing>, + /// A pointer to the dirty log ring + #[allow(unused)] + dirty_log_ring: Option<KvmDirtyLogRing>, } /// KVM Sync Registers used to tell KVM which registers to sync @@ -2104,6 +2107,37 @@ impl VcpuFd { } } + /// Gets the dirty log ring iterator if one is mapped. + /// + /// Returns an iterator over dirty guest frame numbers as (slot, offset) tuples. + /// Returns `None` if the dirty log ring was not enabled via `VmFd::enable_dirty_log_ring` before this vCPU was created. 
+ /// + /// # Returns + /// + /// An optional iterator over the dirty log ring entries. + /// + /// # Example + /// + /// ```no_run + /// # use kvm_ioctls::Kvm; + /// # use kvm_ioctls::Cap; + /// let kvm = Kvm::new().unwrap(); + /// let vm = kvm.create_vm().unwrap(); + /// vm.enable_dirty_log_ring(None).unwrap(); + /// let mut vcpu = vm.create_vcpu(0).unwrap(); + /// if kvm.check_extension(Cap::DirtyLogRing) { + /// if let Some(mut iter) = vcpu.dirty_log_ring_iter() { + /// for (slot, offset) in iter { + /// println!("Dirty page in slot {} at offset {}", slot, offset); + /// } + /// } + /// } + /// ``` + #[cfg(target_arch = "x86_64")] + pub fn dirty_log_ring_iter(&mut self) -> Option> { + self.dirty_log_ring.as_mut() + } + /// Maps the coalesced MMIO ring page. This allows reading entries from /// the ring via [`coalesced_mmio_read()`](VcpuFd::coalesced_mmio_read). /// @@ -2159,11 +2193,16 @@ impl VcpuFd { /// This should not be exported as a public function because the preferred way is to use /// `create_vcpu` from `VmFd`. The function cannot be part of the `VcpuFd` implementation because /// then it would be exported with the public `VcpuFd` interface. 
-pub fn new_vcpu(vcpu: File, kvm_run_ptr: KvmRunWrapper) -> VcpuFd { +pub fn new_vcpu( + vcpu: File, + kvm_run_ptr: KvmRunWrapper, + dirty_log_ring: Option, +) -> VcpuFd { VcpuFd { vcpu, kvm_run_ptr, coalesced_mmio_ring: None, + dirty_log_ring: dirty_log_ring, } } diff --git a/kvm-ioctls/src/ioctls/vm.rs b/kvm-ioctls/src/ioctls/vm.rs index 1b58c243..19ea6bca 100644 --- a/kvm-ioctls/src/ioctls/vm.rs +++ b/kvm-ioctls/src/ioctls/vm.rs @@ -18,7 +18,7 @@ use crate::ioctls::device::DeviceFd; use crate::ioctls::device::new_device; use crate::ioctls::vcpu::VcpuFd; use crate::ioctls::vcpu::new_vcpu; -use crate::ioctls::{KvmRunWrapper, Result}; +use crate::ioctls::{KvmDirtyLogRing, KvmRunWrapper, Result}; use crate::kvm_ioctls::*; use vmm_sys_util::errno; use vmm_sys_util::eventfd::EventFd; @@ -59,6 +59,7 @@ impl From for u64 { pub struct VmFd { vm: File, run_size: usize, + dirty_ring_bytes: usize, } impl VmFd { @@ -1214,7 +1215,15 @@ impl VmFd { let kvm_run_ptr = KvmRunWrapper::mmap_from_fd(&vcpu, self.run_size)?; - Ok(new_vcpu(vcpu, kvm_run_ptr)) + let dirty_log_ring = { + if self.dirty_ring_bytes > 0 { + Some(KvmDirtyLogRing::mmap_from_fd(&vcpu, self.dirty_ring_bytes)?) + } else { + None + } + }; + + Ok(new_vcpu(vcpu, kvm_run_ptr, dirty_log_ring)) } /// Creates a VcpuFd object from a vcpu RawFd. @@ -1250,7 +1259,14 @@ impl VmFd { // SAFETY: we trust the kernel and verified parameters let vcpu = unsafe { File::from_raw_fd(fd) }; let kvm_run_ptr = KvmRunWrapper::mmap_from_fd(&vcpu, self.run_size)?; - Ok(new_vcpu(vcpu, kvm_run_ptr)) + let dirty_log_ring = { + if self.dirty_ring_bytes > 0 { + Some(KvmDirtyLogRing::mmap_from_fd(&vcpu, self.dirty_ring_bytes)?) + } else { + None + } + }; + Ok(new_vcpu(vcpu, kvm_run_ptr, dirty_log_ring)) } /// Creates an emulated device in the kernel. @@ -1915,6 +1931,108 @@ impl VmFd { Ok(()) } + /// Enables KVM's dirty log ring for new vCPUs created on this VM. 
Checks required capabilities and returns + /// `true` if the ring needs to be used together with a backup bitmap `KVM_GET_DIRTY_LOG`. Takes optional + /// dirty ring size as bytes, if not supplied, will use maximum supported dirty ring size. Enabling the dirty + /// log ring is only allowed before any vCPU was created on the VmFd. + /// # Arguments + /// + /// * `bytes` - Size of the dirty log ring in bytes. Needs to be multiple of `std::mem::size_of::<kvm_dirty_gfn>()` + /// and power of two. + #[cfg(target_arch = "x86_64")] + pub fn enable_dirty_log_ring(&self, bytes: Option<i32>) -> Result<bool> { + // Reject sizes that are non-positive, not a power of two, or not a whole number of slots + if let Some(sz) = bytes { + if sz <= 0 + || !(sz as u32).is_power_of_two() + || (sz as usize % std::mem::size_of::<kvm_dirty_gfn>() != 0) + { + return Err(errno::Error::new(libc::EINVAL)); + } + } + + let (dirty_ring_cap, max_bytes, bitmap) = { + // Check if KVM_CAP_DIRTY_LOG_RING_ACQ_REL is available, enable if possible + let acq_rel_sz = self.check_extension_raw(KVM_CAP_DIRTY_LOG_RING_ACQ_REL.into()); + if acq_rel_sz > 0 { + if self.check_extension_raw(KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP.into()) != 0 { + (KVM_CAP_DIRTY_LOG_RING_ACQ_REL, acq_rel_sz, true) + } else { + (KVM_CAP_DIRTY_LOG_RING_ACQ_REL, acq_rel_sz, false) + } + } else { + let sz = self.check_extension_raw(KVM_CAP_DIRTY_LOG_RING.into()); + if sz > 0 { + (KVM_CAP_DIRTY_LOG_RING, sz, false) + } else { + (0, 0, false) + } + } + }; + + if dirty_ring_cap == 0 { + // Neither KVM_CAP_DIRTY_LOG_RING nor KVM_CAP_DIRTY_LOG_RING_ACQ_REL are available + return Err(errno::Error::new(libc::EOPNOTSUPP)); + } + + let cap_ring_size = bytes.unwrap_or(max_bytes); + + // Check if supplied size is larger than what the kernel supports + if cap_ring_size > max_bytes { + return Err(errno::Error::new(libc::EINVAL)); + } + + // Enable dirty rings with _ACQ_REL if supported, or without otherwise + let ar_ring_cap = kvm_enable_cap { + cap: dirty_ring_cap, + args: [cap_ring_size as u64, 0, 0, 0], + ..Default::default() + }; + + 
// Enable the ring cap first + self.enable_cap(&ar_ring_cap)?; + + if bitmap { + let with_bitmap_cap = kvm_enable_cap { + cap: KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP, + ..Default::default() + }; + + // Enable backup bitmap + self.enable_cap(&with_bitmap_cap)?; + } + + Ok(bitmap) + } + + /// Resets all vCPU's dirty log rings. This notifies the kernel that pages have been harvested + /// from the dirty ring and the corresponding pages can be reprotected. + /// + /// # Example + /// + /// ```rust + /// # extern crate kvm_ioctls; + /// # use kvm_ioctls::{Cap, Kvm}; + /// let kvm = Kvm::new().unwrap(); + /// let vm = kvm.create_vm().unwrap(); + /// if kvm.check_extension(Cap::DirtyLogRing) { + ///     vm.enable_dirty_log_ring(None).unwrap(); + ///     vm.reset_dirty_rings().unwrap(); + /// } + /// ``` + /// + #[cfg(target_arch = "x86_64")] + pub fn reset_dirty_rings(&self) -> Result<i32> { + // SAFETY: Safe because we know that our file is a KVM fd and that the request is one of + // the ones defined by kernel. + let ret = unsafe { ioctl(self, KVM_RESET_DIRTY_RINGS()) }; + if ret < 0 { + Err(errno::Error::last()) + } else { + Ok(ret) + } + } + /// Sets a specified piece of vm configuration and/or state. /// /// See the documentation for `KVM_SET_DEVICE_ATTR` in @@ -2011,7 +2129,11 @@ impl VmFd { /// `create_vm` from `Kvm`. The function cannot be part of the `VmFd` implementation because /// then it would be exported with the public `VmFd` interface. 
pub fn new_vmfd(vm: File, run_size: usize) -> VmFd { - VmFd { vm, run_size } + VmFd { + vm, + run_size, + dirty_ring_bytes: 0, + } } impl AsRawFd for VmFd { @@ -2601,6 +2723,7 @@ mod tests { let faulty_vm_fd = VmFd { vm: unsafe { File::from_raw_fd(-2) }, run_size: 0, + dirty_ring_bytes: 0, }; let invalid_mem_region = kvm_userspace_memory_region { diff --git a/kvm-ioctls/src/kvm_ioctls.rs b/kvm-ioctls/src/kvm_ioctls.rs index 43898ba3..b9620170 100644 --- a/kvm-ioctls/src/kvm_ioctls.rs +++ b/kvm-ioctls/src/kvm_ioctls.rs @@ -220,6 +220,8 @@ ioctl_io_nr!(KVM_SET_TSC_KHZ, KVMIO, 0xa2); /* Available with KVM_CAP_GET_TSC_KHZ */ #[cfg(target_arch = "x86_64")] ioctl_io_nr!(KVM_GET_TSC_KHZ, KVMIO, 0xa3); +/* Available with KVM_CAP_DIRTY_LOG_RING */ +ioctl_io_nr!(KVM_RESET_DIRTY_RINGS, KVMIO, 0xc7); /* Available with KVM_CAP_ENABLE_CAP */ #[cfg(not(any(target_arch = "aarch64", target_arch = "riscv64")))]