diff --git a/CHANGELOG.md b/CHANGELOG.md index 2dada8f6cbc..0b308518b95 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,10 @@ and this project adheres to ### Added +- [#5463](https://github.com/firecracker-microvm/firecracker/pull/5463): Added + support for `virtio-pmem` devices. See [documentation](docs/pmem.md) for more + information. + ### Changed ### Deprecated diff --git a/docs/pmem.md b/docs/pmem.md new file mode 100644 index 00000000000..355ecf2ee83 --- /dev/null +++ b/docs/pmem.md @@ -0,0 +1,174 @@ +# Using the Firecracker `virtio-pmem` device + +## What is a persistent memory device + +Persistent memory is a type of non-volatile, CPU accessible (with usual +load/store instructions) memory that does not lose its content on power loss. In +other words, all writes to the memory persist over the power cycle. In hardware +this is known as NVDIMM (Non-Volatile Dual In-line Memory Module). + +## What is a `virtio-pmem` device + +[`virtio-pmem`](https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-68900019) +is a device which emulates a persistent memory device without requiring a +physical NVDIMM device be present on the host system. `virtio-pmem` is backed by +a memory mapped file on the host side and is exposed to the guest kernel as a +region in the guest physical memory. This allows the guest to directly access +the host memory pages without the need to use a guest driver or interact with the +VMM. From the guest user-space perspective `virtio-pmem` devices are presented as +normal block devices like `/dev/pmem0`. This allows `virtio-pmem` to be used as a +rootfs device and make the VM boot from it. + +> [!NOTE] +> +> Since `virtio-pmem` is located fully in memory, when used as a block device +> there is no need to use the guest page cache for its operations. This behaviour +> can be configured by using the `DAX` feature of the kernel. +> +> - To mount a device with `DAX` add `-o dax` to the `mount` command. 
+> - To configure a root device with `DAX` append `rootflags=dax` to the kernel +> arguments. +> +> `DAX` support is not uniform for all file systems. Check the documentation for +> the file system you want to use before enabling `DAX`. + +## Prerequisites + +In order to use a `virtio-pmem` device, the guest kernel needs to be built with +support for it. The full list of configuration options needed for `virtio-pmem` and +`DAX`: + +``` +# Needed for DAX on aarch64. Will be ignored on x86_64 +CONFIG_ARM64_PMEM=y + +CONFIG_DEVICE_MIGRATION=y +CONFIG_ZONE_DEVICE=y +CONFIG_VIRTIO_PMEM=y +CONFIG_LIBNVDIMM=y +CONFIG_BLK_DEV_PMEM=y +CONFIG_ND_CLAIM=y +CONFIG_ND_BTT=y +CONFIG_BTT=y +CONFIG_ND_PFN=y +CONFIG_NVDIMM_PFN=y +CONFIG_NVDIMM_DAX=y +CONFIG_OF_PMEM=y +CONFIG_NVDIMM_KEYS=y +CONFIG_DAX=y +CONFIG_DEV_DAX=y +CONFIG_DEV_DAX_PMEM=y +CONFIG_DEV_DAX_KMEM=y +CONFIG_FS_DAX=y +CONFIG_FS_DAX_PMD=y +``` + +## Configuration + +The Firecracker implementation exposes these config options for the `virtio-pmem` +device: + +- `id` - id of the device for internal use +- `path_on_host` - path to the backing file +- `root_device` - toggle to use this device as the root device. The device will be + marked as `rw` in the kernel arguments +- `read_only` - tells Firecracker to `mmap` the backing file in read-only mode. + If this device is also configured as `root_device`, it will be marked as `ro` + in the kernel arguments + +> [!NOTE] +> +> Devices will be exposed to the guest in the order in which they are configured +> with sequential names in the form `/dev/pmem{N}` like: `/dev/pmem0`, +> `/dev/pmem1` ... + +> [!WARNING] +> +> Setting a `virtio-pmem` device to `read-only` mode can lead to the VM shutting down +> on any attempt to write to the device. This is because from the guest kernel's +> perspective `virtio-pmem` is always `read-write` capable. Use `read-only` mode +> only if you want to ensure the underlying file is never written to. 
+> +> To mount the `pmem` device with the `read-only` option add `-o ro` to the `mount` +> command. +> +> The exact behaviour differs per platform: +> +> - x86_64 - if KVM is able to decode the write instruction used by the guest, +> it will return an MMIO_WRITE to Firecracker, where it will be discarded +> and a warning log will be printed. +> - aarch64 - the instruction emulation is much stricter. Writes will result in +> an internal KVM error which will be returned to Firecracker in the form of an +> `ENOSYS` error. This will make Firecracker stop the VM with an appropriate log +> message. + +> [!WARNING] +> +> `virtio-pmem` requires the guest exposed memory region to be 2MB aligned. +> This requirement is transitively carried to the backing file of the +> `virtio-pmem`. Firecracker allows users to configure `virtio-pmem` with a +> backing file of any size and fills the memory gap between the end of the file +> and the 2MB boundary with empty `PRIVATE | ANONYMOUS` memory pages. Users must +> be careful to not write to this memory gap since it will not be synchronized +> with the backing file. This is not an issue if `virtio-pmem` is configured in +> `read-only` mode. + +### Config file + +Configuration of the `virtio-pmem` device from a config file follows a similar +pattern to the `virtio-block` section. Here is an example configuration for a single +`virtio-pmem` device: + +```json +"pmem": [ + { + "id": "pmem0", + "path_on_host": "./some_file", + "root_device": true, + "read_only": false + } +] +``` + +### API + +Similar to other devices `virtio-pmem` can be configured with API calls. 
An +example of a configuration request: + +```console +curl --unix-socket $socket_location -i \ + -X PUT 'http://localhost/pmem/pmem0' \ + -H 'Accept: application/json' \ + -H 'Content-Type: application/json' \ + -d "{ + \"id\": \"pmem0\", + \"path_on_host\": \"./some_file\", + \"root_device\": true, + \"read_only\": false + }" +``` + +## Security + +`virtio-pmem` can be used for sharing the underlying backing file between multiple +VMs by providing the same backing file to the `virtio-pmem` devices of the corresponding +VMs. This scenario poses a security risk of side channel attacks between VMs. +Users are encouraged to evaluate the risks before using `virtio-pmem` for such +scenarios. + +## Snapshot support + +`virtio-pmem` works with the snapshot functionality of Firecracker. The snapshot will +contain the configuration options provided by the user. During the restoration +process, Firecracker will attempt to restore the `virtio-pmem` device by opening +the same backing file as was configured in the first place. This means all +`virtio-pmem` backing files should be present in the same locations during +restore as they were during the initial `virtio-pmem` configuration. + +## Performance + +Even though `virtio-pmem` allows for the direct access of host pages from the +guest, the performance of the first access of each page will suffer from the +internal KVM page fault which will have to set up the guest physical address to host +virtual address translation. Subsequent accesses will not need to go through +this process again. diff --git a/resources/seccomp/aarch64-unknown-linux-musl.json b/resources/seccomp/aarch64-unknown-linux-musl.json index 433528b8f29..d81a1012599 100644 --- a/resources/seccomp/aarch64-unknown-linux-musl.json +++ b/resources/seccomp/aarch64-unknown-linux-musl.json @@ -215,6 +215,19 @@ "syscall": "madvise", "comment": "Used by the VirtIO balloon device and by musl for some customer workloads. It is also used by aws-lc during random number generation. 
They setup a memory page that mark with MADV_WIPEONFORK to be able to detect forks. They also call it with -1 to see if madvise is supported in certain platforms." }, + { + "syscall": "msync", + "comment": "Used by the VirtIO pmem device to sync the file content with the backing file.", + "args": [ + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 4, + "comment": "libc::MS_SYNC" + } + ] + }, { "syscall": "mmap", "comment": "Used by the VirtIO balloon device", diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index 14f2a26bafd..66c986495fb 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -215,6 +215,19 @@ "syscall": "madvise", "comment": "Used by the VirtIO balloon device and by musl for some customer workloads. It is also used by aws-lc during random number generation. They setup a memory page that mark with MADV_WIPEONFORK to be able to detect forks. They also call it with -1 to see if madvise is supported in certain platforms." 
}, + { + "syscall": "msync", + "comment": "Used by the VirtIO pmem device to sync the file content with the backing file.", + "args": [ + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 4, + "comment": "libc::MS_SYNC" + } + ] + }, { "syscall": "mmap", "comment": "Used by the VirtIO balloon device", diff --git a/src/firecracker/src/api_server/parsed_request.rs b/src/firecracker/src/api_server/parsed_request.rs index d16be7a63be..9f1ab870061 100644 --- a/src/firecracker/src/api_server/parsed_request.rs +++ b/src/firecracker/src/api_server/parsed_request.rs @@ -24,6 +24,7 @@ use super::request::machine_configuration::{ use super::request::metrics::parse_put_metrics; use super::request::mmds::{parse_get_mmds, parse_patch_mmds, parse_put_mmds}; use super::request::net::{parse_patch_net, parse_put_net}; +use super::request::pmem::parse_put_pmem; use super::request::snapshot::{parse_patch_vm_state, parse_put_snapshot}; use super::request::version::parse_get_version; use super::request::vsock::parse_put_vsock; @@ -90,6 +91,7 @@ impl TryFrom<&Request> for ParsedRequest { (Method::Put, "boot-source", Some(body)) => parse_put_boot_source(body), (Method::Put, "cpu-config", Some(body)) => parse_put_cpu_config(body), (Method::Put, "drives", Some(body)) => parse_put_drive(body, path_tokens.next()), + (Method::Put, "pmem", Some(body)) => parse_put_pmem(body, path_tokens.next()), (Method::Put, "logger", Some(body)) => parse_put_logger(body), (Method::Put, "serial", Some(body)) => parse_put_serial(body), (Method::Put, "machine-config", Some(body)) => parse_put_machine_config(body), diff --git a/src/firecracker/src/api_server/request/mod.rs b/src/firecracker/src/api_server/request/mod.rs index a406842d0a6..276a89d5a4e 100644 --- a/src/firecracker/src/api_server/request/mod.rs +++ b/src/firecracker/src/api_server/request/mod.rs @@ -13,6 +13,7 @@ pub mod machine_configuration; pub mod metrics; pub mod mmds; pub mod net; +pub mod pmem; pub mod serial; pub mod snapshot; pub mod 
version; diff --git a/src/firecracker/src/api_server/request/pmem.rs b/src/firecracker/src/api_server/request/pmem.rs new file mode 100644 index 00000000000..dc538a0d5fc --- /dev/null +++ b/src/firecracker/src/api_server/request/pmem.rs @@ -0,0 +1,75 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use vmm::logger::{IncMetric, METRICS}; +use vmm::rpc_interface::VmmAction; +use vmm::vmm_config::pmem::PmemConfig; + +use super::super::parsed_request::{ParsedRequest, RequestError, checked_id}; +use super::{Body, StatusCode}; + +pub(crate) fn parse_put_pmem( + body: &Body, + id_from_path: Option<&str>, +) -> Result { + METRICS.put_api_requests.pmem_count.inc(); + let id = if let Some(id) = id_from_path { + checked_id(id)? + } else { + METRICS.put_api_requests.pmem_fails.inc(); + return Err(RequestError::EmptyID); + }; + + let device_cfg = serde_json::from_slice::(body.raw()).inspect_err(|_| { + METRICS.put_api_requests.pmem_fails.inc(); + })?; + + if id != device_cfg.id { + METRICS.put_api_requests.pmem_fails.inc(); + Err(RequestError::Generic( + StatusCode::BadRequest, + "The id from the path does not match the id from the body!".to_string(), + )) + } else { + Ok(ParsedRequest::new_sync(VmmAction::InsertPmemDevice( + device_cfg, + ))) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::api_server::parsed_request::tests::vmm_action_from_request; + + #[test] + fn test_parse_put_pmem_request() { + parse_put_pmem(&Body::new("invalid_payload"), None).unwrap_err(); + parse_put_pmem(&Body::new("invalid_payload"), Some("id")).unwrap_err(); + + let body = r#"{ + "id": "bar", + }"#; + parse_put_pmem(&Body::new(body), Some("1")).unwrap_err(); + let body = r#"{ + "foo": "1", + }"#; + parse_put_pmem(&Body::new(body), Some("1")).unwrap_err(); + + let body = r#"{ + "id": "1000", + "path_on_host": "dummy", + "root_device": true, + "read_only": true + }"#; + let r = 
vmm_action_from_request(parse_put_pmem(&Body::new(body), Some("1000")).unwrap()); + + let expected_config = PmemConfig { + id: "1000".to_string(), + path_on_host: "dummy".to_string(), + root_device: true, + read_only: true, + }; + assert_eq!(r, VmmAction::InsertPmemDevice(expected_config)); + } +} diff --git a/src/firecracker/swagger/firecracker.yaml b/src/firecracker/swagger/firecracker.yaml index 598db98229e..f775f6aa3f3 100644 --- a/src/firecracker/swagger/firecracker.yaml +++ b/src/firecracker/swagger/firecracker.yaml @@ -282,6 +282,38 @@ paths: schema: $ref: "#/definitions/Error" + /pmem/{id}: + put: + summary: Creates or updates a pmem device. Pre-boot only. + description: + Creates new pmem device with ID specified by id parameter. + If a pmem device with the specified ID already exists, updates its state based on new input. + Will fail if update is not possible. + operationId: putGuestPmemByID + parameters: + - name: id + in: path + description: The id of the guest pmem device + required: true + type: string + - name: body + in: body + description: Guest pmem device properties + required: true + schema: + $ref: "#/definitions/Pmem" + responses: + 204: + description: Pmem device is created/updated + 400: + description: Pmem device cannot be created/updated due to bad input + schema: + $ref: "#/definitions/Error" + default: + description: Internal server error. + schema: + $ref: "#/definitions/Error" + /logger: put: summary: Initializes the logger by specifying a named pipe or a file for the logs output. @@ -934,6 +966,27 @@ definitions: Path to the socket of vhost-user-block backend. This field is required for vhost-user-block config should be omitted for virtio-block configuration. 
+ Pmem: + type: object + required: + - id + - path_on_host + - root_device + - read_only + properties: + id: + type: string + description: Id of the pmem device. Must match the id given in the request path. + path_on_host: + type: string + description: Host path to the file backing the pmem device. + root_device: + type: boolean + description: Use this device as the guest root device. + read_only: + type: boolean + description: Map the backing file in read-only mode. + Error: type: object properties: diff --git a/src/vmm/src/arch/aarch64/layout.rs b/src/vmm/src/arch/aarch64/layout.rs index c4937e43c92..b50f8fb40e6 100644 --- a/src/vmm/src/arch/aarch64/layout.rs +++ b/src/vmm/src/arch/aarch64/layout.rs @@ -139,3 +139,5 @@ pub const MEM_64BIT_DEVICES_START: u64 = MMIO64_MEM_START; pub const MEM_64BIT_DEVICES_SIZE: u64 = MMIO64_MEM_SIZE; /// First address past the 64-bit MMIO gap pub const FIRST_ADDR_PAST_64BITS_MMIO: u64 = MMIO64_MEM_START + MMIO64_MEM_SIZE; +/// Size of the memory past 64-bit MMIO gap +pub const PAST_64BITS_MMIO_SIZE: u64 = 512 << 30; diff --git a/src/vmm/src/arch/mod.rs b/src/vmm/src/arch/mod.rs index 6d33ce461b9..435ed5483ea 100644 --- a/src/vmm/src/arch/mod.rs +++ b/src/vmm/src/arch/mod.rs @@ -21,14 +21,7 @@ pub use aarch64::vm::{ArchVm, ArchVmError, VmState}; #[cfg(target_arch = "aarch64")] pub use aarch64::{ ConfigurationError, arch_memory_regions, configure_system_for_boot, get_kernel_start, - initrd_load_addr, layout::BOOT_DEVICE_MEM_START, layout::CMDLINE_MAX_SIZE, - layout::GSI_LEGACY_END, layout::GSI_LEGACY_NUM, layout::GSI_LEGACY_START, layout::GSI_MSI_END, - layout::GSI_MSI_NUM, layout::GSI_MSI_START, layout::MEM_32BIT_DEVICES_SIZE, - layout::MEM_32BIT_DEVICES_START, layout::MEM_64BIT_DEVICES_SIZE, - layout::MEM_64BIT_DEVICES_START, layout::MMIO32_MEM_SIZE, layout::MMIO32_MEM_START, - layout::PCI_MMCONFIG_SIZE, layout::PCI_MMCONFIG_START, - layout::PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, layout::RTC_MEM_START, layout::SERIAL_MEM_START, - 
layout::SPI_START, layout::SYSTEM_MEM_SIZE, layout::SYSTEM_MEM_START, load_kernel, + initrd_load_addr, layout::*, load_kernel, }; /// Module for x86_64 related functionality. 
@@ -45,14 +38,7 @@ pub use x86_64::vm::{ArchVm, ArchVmError, VmState}; #[cfg(target_arch = "x86_64")] pub use crate::arch::x86_64::{ ConfigurationError, arch_memory_regions, configure_system_for_boot, get_kernel_start, - initrd_load_addr, layout::APIC_ADDR, layout::BOOT_DEVICE_MEM_START, layout::CMDLINE_MAX_SIZE, - layout::GSI_LEGACY_END, layout::GSI_LEGACY_NUM, layout::GSI_LEGACY_START, layout::GSI_MSI_END, - layout::GSI_MSI_NUM, layout::GSI_MSI_START, layout::IOAPIC_ADDR, - layout::MEM_32BIT_DEVICES_SIZE, layout::MEM_32BIT_DEVICES_START, - layout::MEM_64BIT_DEVICES_SIZE, layout::MEM_64BIT_DEVICES_START, layout::MMIO32_MEM_SIZE, - layout::MMIO32_MEM_START, layout::PCI_MMCONFIG_SIZE, layout::PCI_MMCONFIG_START, - layout::PCI_MMIO_CONFIG_SIZE_PER_SEGMENT, layout::SYSTEM_MEM_SIZE, layout::SYSTEM_MEM_START, - load_kernel, + initrd_load_addr, layout::*, load_kernel, }; /// Types of devices that can get attached to this platform. diff --git a/src/vmm/src/arch/x86_64/layout.rs b/src/vmm/src/arch/x86_64/layout.rs index 34ad343af2a..9a95e2b06bc 100644 --- a/src/vmm/src/arch/x86_64/layout.rs +++ b/src/vmm/src/arch/x86_64/layout.rs @@ -132,3 +132,5 @@ pub const MEM_64BIT_DEVICES_START: u64 = MMIO64_MEM_START; pub const MEM_64BIT_DEVICES_SIZE: u64 = MMIO64_MEM_SIZE; /// First address past the 64-bit MMIO gap pub const FIRST_ADDR_PAST_64BITS_MMIO: u64 = MMIO64_MEM_START + MMIO64_MEM_SIZE; +/// Size of the memory past 64-bit MMIO gap +pub const PAST_64BITS_MMIO_SIZE: u64 = 512 << 30; diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 6bd81c46f18..0dea14207ad 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -33,6 +33,7 @@ use crate::devices::acpi::vmgenid::VmGenIdError; use crate::devices::virtio::balloon::Balloon; use crate::devices::virtio::block::device::Block; use crate::devices::virtio::net::Net; +use crate::devices::virtio::pmem::device::Pmem; use crate::devices::virtio::rng::Entropy; use crate::devices::virtio::vsock::{Vsock, 
VsockUnixBackend}; #[cfg(feature = "gdb")] @@ -68,6 +69,8 @@ pub enum StartMicrovmError { CreateGuestConfig(#[from] GuestConfigError), /// Cannot create network device: {0} CreateNetDevice(crate::devices::virtio::net::NetError), + /// Cannot create pmem device: {0} + CreatePmemDevice(#[from] crate::devices::virtio::pmem::device::PmemError), /// Cannot create RateLimiter: {0} CreateRateLimiter(io::Error), /// Error creating legacy device: {0} @@ -219,6 +222,13 @@ pub fn build_microvm_for_boot( vm_resources.net_builder.iter(), event_manager, )?; + attach_pmem_devices( + &mut device_manager, + &vm, + &mut boot_cmdline, + vm_resources.pmem.devices.iter(), + event_manager, + )?; if let Some(unix_vsock) = vm_resources.vsock.get() { attach_unixsock_vsock_device( @@ -609,6 +619,34 @@ fn attach_net_devices<'a, I: Iterator>> + Debug>( Ok(()) } +fn attach_pmem_devices<'a, I: Iterator>> + Debug>( + device_manager: &mut DeviceManager, + vm: &Arc, + cmdline: &mut LoaderKernelCmdline, + pmem_devices: I, + event_manager: &mut EventManager, +) -> Result<(), StartMicrovmError> { + for (i, device) in pmem_devices.enumerate() { + let id = { + let mut locked_dev = device.lock().expect("Poisoned lock"); + if locked_dev.config.root_device { + cmdline.insert_str(format!("root=/dev/pmem{i}"))?; + match locked_dev.config.read_only { + true => cmdline.insert_str("ro")?, + false => cmdline.insert_str("rw")?, + } + } + locked_dev.alloc_region(vm.as_ref()); + locked_dev.set_mem_region(vm.as_ref())?; + locked_dev.config.id.to_string() + }; + + event_manager.add_subscriber(device.clone()); + device_manager.attach_virtio_device(vm, id, device.clone(), cmdline, false)?; + } + Ok(()) +} + fn attach_unixsock_vsock_device( device_manager: &mut DeviceManager, vm: &Arc, @@ -655,6 +693,7 @@ pub(crate) mod tests { use crate::vmm_config::drive::{BlockBuilder, BlockDeviceConfig}; use crate::vmm_config::entropy::{EntropyDeviceBuilder, EntropyDeviceConfig}; use crate::vmm_config::net::{NetBuilder, 
NetworkInterfaceConfig}; + use crate::vmm_config::pmem::{PmemBuilder, PmemConfig}; use crate::vmm_config::vsock::tests::default_config; use crate::vmm_config::vsock::{VsockBuilder, VsockDeviceConfig}; use crate::vstate::vm::tests::setup_vm_with_memory; @@ -762,7 +801,9 @@ pub(crate) mod tests { socket: None, }; - block_dev_configs.insert(block_device_config).unwrap(); + block_dev_configs + .insert(block_device_config, false) + .unwrap(); } attach_block_devices( @@ -873,6 +914,34 @@ pub(crate) mod tests { ); } + pub(crate) fn insert_pmem_devices( + vmm: &mut Vmm, + cmdline: &mut Cmdline, + event_manager: &mut EventManager, + configs: Vec, + ) -> Vec { + let mut builder = PmemBuilder::default(); + let mut files = Vec::new(); + for mut config in configs { + let tmp_file = TempFile::new().unwrap(); + tmp_file.as_file().set_len(0x20_0000).unwrap(); + let tmp_file_path = tmp_file.as_path().to_str().unwrap().to_string(); + files.push(tmp_file); + config.path_on_host = tmp_file_path; + builder.build(config, false).unwrap(); + } + + attach_pmem_devices( + &mut vmm.device_manager, + &vmm.vm, + cmdline, + builder.devices.iter(), + event_manager, + ) + .unwrap(); + files + } + #[cfg(target_arch = "x86_64")] pub(crate) fn insert_vmgenid_device(vmm: &mut Vmm) { vmm.device_manager @@ -1120,6 +1189,28 @@ pub(crate) mod tests { } } + #[test] + fn test_attach_pmem_devices() { + let mut event_manager = EventManager::new().expect("Unable to create EventManager"); + + let id = String::from("root"); + let configs = vec![PmemConfig { + id: id.clone(), + path_on_host: "".into(), + root_device: true, + read_only: true, + }]; + let mut vmm = default_vmm(); + let mut cmdline = default_kernel_cmdline(); + _ = insert_pmem_devices(&mut vmm, &mut cmdline, &mut event_manager, configs); + assert!(cmdline_contains(&cmdline, "root=/dev/pmem0 ro")); + assert!( + vmm.device_manager + .get_virtio_device(virtio_ids::VIRTIO_ID_PMEM, id.as_str()) + .is_some() + ); + } + #[test] fn 
test_attach_boot_timer_device() { let mut vmm = default_vmm(); diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index a9290886997..f4f9ffe69d2 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -20,6 +20,8 @@ use crate::devices::virtio::device::VirtioDevice; use crate::devices::virtio::generated::virtio_ids; use crate::devices::virtio::net::Net; use crate::devices::virtio::net::persist::{NetConstructorArgs, NetState}; +use crate::devices::virtio::pmem::device::Pmem; +use crate::devices::virtio::pmem::persist::{PmemConstructorArgs, PmemState}; use crate::devices::virtio::rng::Entropy; use crate::devices::virtio::rng::persist::{EntropyConstructorArgs, EntropyState}; use crate::devices::virtio::transport::pci::device::{ @@ -238,6 +240,8 @@ pub struct PciDevicesState { pub mmds: Option, /// Entropy device state. pub entropy_device: Option>, + /// Pmem device states. + pub pmem_devices: Vec>, } pub struct PciDevicesConstructorArgs<'a> { @@ -386,6 +390,19 @@ impl<'a> Persist<'a> for PciDevices { transport_state, }) } + virtio_ids::VIRTIO_ID_PMEM => { + let pmem_dev = locked_virtio_dev + .as_mut_any() + .downcast_mut::() + .unwrap(); + let device_state = pmem_dev.save(); + state.pmem_devices.push(VirtioDeviceState { + device_id: pmem_dev.config.id.clone(), + pci_device_bdf, + device_state, + transport_state, + }); + } _ => unreachable!(), } } @@ -564,6 +581,34 @@ impl<'a> Persist<'a> for PciDevices { .unwrap() } + for pmem_state in &state.pmem_devices { + let device = Arc::new(Mutex::new( + Pmem::restore( + PmemConstructorArgs { + mem, + vm: constructor_args.vm.as_ref(), + }, + &pmem_state.device_state, + ) + .unwrap(), + )); + + constructor_args + .vm_resources + .update_from_restored_device(SharedDeviceType::Pmem(device.clone())) + .unwrap(); + + pci_devices + .restore_pci_device( + constructor_args.vm, + device, + &pmem_state.device_id, + &pmem_state.transport_state, + 
constructor_args.event_manager, + ) + .unwrap() + } + Ok(pci_devices) } } @@ -582,6 +627,7 @@ mod tests { use crate::vmm_config::balloon::BalloonDeviceConfig; use crate::vmm_config::entropy::EntropyDeviceConfig; use crate::vmm_config::net::NetworkInterfaceConfig; + use crate::vmm_config::pmem::PmemConfig; use crate::vmm_config::vsock::VsockDeviceConfig; #[test] @@ -589,6 +635,7 @@ mod tests { let mut buf = vec![0; 65536]; // These need to survive so the restored blocks find them. let _block_files; + let _pmem_files; let mut tmp_sock_file = TempFile::new().unwrap(); tmp_sock_file.remove().unwrap(); // Set up a vmm with one of each device, and get the serialized DeviceStates. @@ -642,6 +689,16 @@ mod tests { // Add an entropy device. let entropy_config = EntropyDeviceConfig::default(); insert_entropy_device(&mut vmm, &mut cmdline, &mut event_manager, entropy_config); + // Add a pmem device. + let pmem_id = String::from("pmem"); + let pmem_configs = vec![PmemConfig { + id: pmem_id, + path_on_host: "".into(), + root_device: true, + read_only: true, + }]; + _pmem_files = + insert_pmem_devices(&mut vmm, &mut cmdline, &mut event_manager, pmem_configs); Snapshot::new(vmm.device_manager.save()) .save(&mut buf.as_mut_slice()) @@ -730,10 +787,19 @@ mod tests { }}, "entropy": {{ "rate_limiter": null - }} + }}, + "pmem": [ + {{ + "id": "pmem", + "path_on_host": "{}", + "root_device": true, + "read_only": true + }} + ] }}"#, _block_files.last().unwrap().as_path().to_str().unwrap(), - tmp_sock_file.as_path().to_str().unwrap() + tmp_sock_file.as_path().to_str().unwrap(), + _pmem_files.last().unwrap().as_path().to_str().unwrap(), ); assert_eq!( diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index f874975981b..d8d486d9ed7 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -30,6 +30,10 @@ use crate::devices::virtio::net::persist::{ NetConstructorArgs, NetPersistError as NetError, NetState, }; 
use crate::devices::virtio::persist::{MmioTransportConstructorArgs, MmioTransportState}; +use crate::devices::virtio::pmem::device::Pmem; +use crate::devices::virtio::pmem::persist::{ + PmemConstructorArgs, PmemPersistError as PmemError, PmemState, +}; use crate::devices::virtio::rng::Entropy; use crate::devices::virtio::rng::persist::{ EntropyConstructorArgs, EntropyPersistError as EntropyError, EntropyState, @@ -73,6 +77,8 @@ pub enum DevicePersistError { MmdsConfig(#[from] MmdsConfigError), /// Entropy: {0} Entropy(#[from] EntropyError), + /// Pmem: {0} + Pmem(#[from] PmemError), /// Resource misconfiguration: {0}. Is the snapshot file corrupted? ResourcesError(#[from] ResourcesError), /// Could not activate device: {0} @@ -126,6 +132,8 @@ pub struct DeviceStates { pub mmds: Option, /// Entropy device state. pub entropy_device: Option>, + /// Pmem device states. + pub pmem_devices: Vec>, } /// A type used to extract the concrete `Arc>` for each of the device @@ -137,6 +145,7 @@ pub enum SharedDeviceType { Balloon(Arc>), Vsock(Arc>>), Entropy(Arc>), + Pmem(Arc>), } pub struct MMIODevManagerConstructorArgs<'a> { @@ -336,6 +345,16 @@ impl<'a> Persist<'a> for MMIODeviceManager { device_info, }); } + virtio_ids::VIRTIO_ID_PMEM => { + let pmem = locked_device.as_mut_any().downcast_mut::().unwrap(); + let device_state = pmem.save(); + states.pmem_devices.push(VirtioDeviceState { + device_id, + device_state, + transport_state, + device_info, + }) + } _ => unreachable!(), }; @@ -550,6 +569,31 @@ impl<'a> Persist<'a> for MMIODeviceManager { )?; } + for pmem_state in &state.pmem_devices { + let device = Arc::new(Mutex::new(Pmem::restore( + PmemConstructorArgs { + mem, + vm: vm.as_ref(), + }, + &pmem_state.device_state, + )?)); + + constructor_args + .vm_resources + .update_from_restored_device(SharedDeviceType::Pmem(device.clone()))?; + + restore_helper( + device.clone(), + pmem_state.device_state.virtio_state.activated, + false, + device, + &pmem_state.device_id, + 
&pmem_state.transport_state, + &pmem_state.device_info, + constructor_args.event_manager, + )?; + } + Ok(dev_manager) } } @@ -567,6 +611,7 @@ mod tests { use crate::vmm_config::balloon::BalloonDeviceConfig; use crate::vmm_config::entropy::EntropyDeviceConfig; use crate::vmm_config::net::NetworkInterfaceConfig; + use crate::vmm_config::pmem::PmemConfig; use crate::vmm_config::vsock::VsockDeviceConfig; impl PartialEq for VirtioDeviceState { @@ -614,6 +659,7 @@ mod tests { let mut buf = vec![0; 65536]; // These need to survive so the restored blocks find them. let _block_files; + let _pmem_files; let mut tmp_sock_file = TempFile::new().unwrap(); tmp_sock_file.remove().unwrap(); // Set up a vmm with one of each device, and get the serialized DeviceStates. @@ -666,6 +712,16 @@ mod tests { // Add an entropy device. let entropy_config = EntropyDeviceConfig::default(); insert_entropy_device(&mut vmm, &mut cmdline, &mut event_manager, entropy_config); + // Add a pmem device. + let pmem_id = String::from("pmem"); + let pmem_configs = vec![PmemConfig { + id: pmem_id, + path_on_host: "".into(), + root_device: true, + read_only: true, + }]; + _pmem_files = + insert_pmem_devices(&mut vmm, &mut cmdline, &mut event_manager, pmem_configs); Snapshot::new(vmm.device_manager.save()) .save(&mut buf.as_mut_slice()) @@ -750,10 +806,19 @@ mod tests { }}, "entropy": {{ "rate_limiter": null - }} + }}, + "pmem": [ + {{ + "id": "pmem", + "path_on_host": "{}", + "root_device": true, + "read_only": true + }} + ] }}"#, _block_files.last().unwrap().as_path().to_str().unwrap(), - tmp_sock_file.as_path().to_str().unwrap() + tmp_sock_file.as_path().to_str().unwrap(), + _pmem_files.last().unwrap().as_path().to_str().unwrap(), ); assert_eq!( diff --git a/src/vmm/src/devices/virtio/mod.rs b/src/vmm/src/devices/virtio/mod.rs index 1e9e3541720..840dbffdb5e 100644 --- a/src/vmm/src/devices/virtio/mod.rs +++ b/src/vmm/src/devices/virtio/mod.rs @@ -20,6 +20,7 @@ mod iov_deque; pub mod iovec; pub mod net; 
pub mod persist; +pub mod pmem; pub mod queue; pub mod rng; pub mod test_utils; diff --git a/src/vmm/src/devices/virtio/pmem/device.rs b/src/vmm/src/devices/virtio/pmem/device.rs new file mode 100644 index 00000000000..e406adc532d --- /dev/null +++ b/src/vmm/src/devices/virtio/pmem/device.rs @@ -0,0 +1,545 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::fs::{File, OpenOptions}; +use std::ops::{Deref, DerefMut}; +use std::os::fd::AsRawFd; +use std::sync::{Arc, Mutex}; + +use kvm_bindings::{KVM_MEM_READONLY, kvm_userspace_memory_region}; +use kvm_ioctls::VmFd; +use serde::{Deserialize, Serialize}; +use vm_allocator::AllocPolicy; +use vm_memory::mmap::{MmapRegionBuilder, MmapRegionError}; +use vm_memory::{GuestAddress, GuestMemoryError}; +use vmm_sys_util::eventfd::EventFd; + +use crate::devices::virtio::ActivateError; +use crate::devices::virtio::device::{ActiveState, DeviceState, VirtioDevice}; +use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; +use crate::devices::virtio::generated::virtio_ids::VIRTIO_ID_PMEM; +use crate::devices::virtio::pmem::PMEM_QUEUE_SIZE; +use crate::devices::virtio::pmem::metrics::{PmemMetrics, PmemMetricsPerDevice}; +use crate::devices::virtio::queue::{DescriptorChain, InvalidAvailIdx, Queue, QueueError}; +use crate::devices::virtio::transport::{VirtioInterrupt, VirtioInterruptType}; +use crate::logger::{IncMetric, error}; +use crate::utils::{align_up, u64_to_usize}; +use crate::vmm_config::pmem::PmemConfig; +use crate::vstate::memory::{ByteValued, Bytes, GuestMemoryMmap, GuestMmapRegion}; +use crate::{Vm, impl_device_type}; + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum PmemError { + /// Cannot set the memory regions: {0} + SetUserMemoryRegion(kvm_ioctls::Error), + /// Unablet to allocate a KVM slot for the device + NoKvmSlotAvailable, + /// Error accessing backing file: {0} + BackingFile(std::io::Error), + /// 
Error backing file size is 0 + BackingFileZeroSize, + /// Error with EventFd: {0} + EventFd(std::io::Error), + /// Unexpected read-only descriptor + ReadOnlyDescriptor, + /// Unexpected write-only descriptor + WriteOnlyDescriptor, + /// UnknownRequestType: {0} + UnknownRequestType(u32), + /// Descriptor chain too short + DescriptorChainTooShort, + /// Guest memory error: {0} + GuestMemory(#[from] GuestMemoryError), + /// Error handling the VirtIO queue: {0} + Queue(#[from] QueueError), + /// Error during obtaining the descriptor from the queue: {0} + QueuePop(#[from] InvalidAvailIdx), +} + +const VIRTIO_PMEM_REQ_TYPE_FLUSH: u32 = 0; +const SUCCESS: i32 = 0; +const FAILURE: i32 = -1; + +#[derive(Debug, Default, Copy, Clone, Serialize, Deserialize)] +#[repr(C)] +pub struct ConfigSpace { + // Physical address of the first byte of the persistent memory region. + pub start: u64, + // Length of the address range + pub size: u64, +} + +// SAFETY: `ConfigSpace` contains only PODs in `repr(c)`, without padding. +unsafe impl ByteValued for ConfigSpace {} + +#[derive(Debug)] +pub struct Pmem { + // VirtIO fields + pub avail_features: u64, + pub acked_features: u64, + pub activate_event: EventFd, + + // Transport fields + pub device_state: DeviceState, + pub queues: Vec, + pub queue_events: Vec, + + // Pmem specific fields + pub config_space: ConfigSpace, + pub file: File, + pub file_len: u64, + pub mmap_ptr: u64, + pub metrics: Arc, + + pub config: PmemConfig, +} + +impl Pmem { + // Pmem devices need to have address and size to be + // a multiple of 2MB + pub const ALIGNMENT: u64 = 2 * 1024 * 1024; + + /// Create a new Pmem device with a backing file at `disk_image_path` path. + pub fn new(config: PmemConfig) -> Result { + Self::new_with_queues(config, vec![Queue::new(PMEM_QUEUE_SIZE)]) + } + + /// Create a new Pmem device with a backing file at `disk_image_path` path using a pre-created + /// set of queues. 
+ pub fn new_with_queues(config: PmemConfig, queues: Vec) -> Result { + let (file, file_len, mmap_ptr, mmap_len) = + Self::mmap_backing_file(&config.path_on_host, config.read_only)?; + + Ok(Self { + avail_features: 1u64 << VIRTIO_F_VERSION_1, + acked_features: 0u64, + activate_event: EventFd::new(libc::EFD_NONBLOCK).map_err(PmemError::EventFd)?, + device_state: DeviceState::Inactive, + queues, + queue_events: vec![EventFd::new(libc::EFD_NONBLOCK).map_err(PmemError::EventFd)?], + config_space: ConfigSpace { + start: 0, + size: mmap_len, + }, + file, + file_len, + mmap_ptr, + metrics: PmemMetricsPerDevice::alloc(config.id.clone()), + config, + }) + } + + pub fn mmap_backing_file( + path: &str, + read_only: bool, + ) -> Result<(File, u64, u64, u64), PmemError> { + let file = OpenOptions::new() + .read(true) + .write(!read_only) + .open(path) + .map_err(PmemError::BackingFile)?; + let file_len = file.metadata().unwrap().len(); + if (file_len == 0) { + return Err(PmemError::BackingFileZeroSize); + } + + let mut prot = libc::PROT_READ; + if !read_only { + prot |= libc::PROT_WRITE; + } + + let mmap_len = align_up(file_len, Self::ALIGNMENT); + let mmap_ptr = if (mmap_len == file_len) { + // SAFETY: We are calling the system call with valid arguments and checking the returned + // value + unsafe { + let r = libc::mmap( + std::ptr::null_mut(), + u64_to_usize(file_len), + prot, + libc::MAP_SHARED | libc::MAP_NORESERVE, + file.as_raw_fd(), + 0, + ); + if r == libc::MAP_FAILED { + return Err(PmemError::BackingFile(std::io::Error::last_os_error())); + } + r + } + } else { + // SAFETY: We are calling system calls with valid arguments and checking returned + // values + // + // The double mapping is done to ensure the underlying memory has the size of + // `mmap_len` (which is 2MB aligned as per `virtio-pmem` specification) + // First mmap creates a mapping of `mmap_len` while second mmaps the actual + // file on top.
The remaining gap between the end of the mmaped file and + // the actual end of the memory region is backed by PRIVATE | ANONYMOUS memory. + unsafe { + let mmap_ptr = libc::mmap( + std::ptr::null_mut(), + u64_to_usize(mmap_len), + prot, + libc::MAP_PRIVATE | libc::MAP_NORESERVE | libc::MAP_ANONYMOUS, + -1, + 0, + ); + if mmap_ptr == libc::MAP_FAILED { + return Err(PmemError::BackingFile(std::io::Error::last_os_error())); + } + let r = libc::mmap( + mmap_ptr, + u64_to_usize(file_len), + prot, + libc::MAP_SHARED | libc::MAP_NORESERVE | libc::MAP_FIXED, + file.as_raw_fd(), + 0, + ); + if r == libc::MAP_FAILED { + return Err(PmemError::BackingFile(std::io::Error::last_os_error())); + } + mmap_ptr + } + }; + Ok((file, file_len, mmap_ptr as u64, mmap_len)) + } + + /// Allocate memory in past_mmio64 memory region + pub fn alloc_region(&mut self, vm: &Vm) { + let mut resource_allocator_lock = vm.resource_allocator(); + let resource_allocator = resource_allocator_lock.deref_mut(); + let addr = resource_allocator + .past_mmio64_memory + .allocate( + self.config_space.size, + Pmem::ALIGNMENT, + AllocPolicy::FirstMatch, + ) + .unwrap(); + self.config_space.start = addr.start(); + } + + /// Set user memory region in KVM + pub fn set_mem_region(&mut self, vm: &Vm) -> Result<(), PmemError> { + let next_slot = vm.next_kvm_slot().ok_or(PmemError::NoKvmSlotAvailable)?; + let memory_region = kvm_userspace_memory_region { + slot: next_slot, + guest_phys_addr: self.config_space.start, + memory_size: self.config_space.size, + userspace_addr: self.mmap_ptr, + flags: if self.config.read_only { + KVM_MEM_READONLY + } else { + 0 + }, + }; + // SAFETY: All arguments are correct + unsafe { + vm.fd() + .set_user_memory_region(memory_region) + .map_err(PmemError::SetUserMemoryRegion)?; + } + Ok(()) + } + + fn handle_queue(&mut self) -> Result<(), PmemError> { + // This is safe since we checked in the event handler that the device is activated.
+ let active_state = self.device_state.active_state().unwrap(); + + while let Some(head) = self.queues[0].pop()? { + let add_result = match self.process_chain(head) { + Ok(()) => self.queues[0].add_used(head.index, 4), + Err(err) => { + error!("pmem: {err}"); + self.metrics.event_fails.inc(); + self.queues[0].add_used(head.index, 0) + } + }; + if let Err(err) = add_result { + error!("pmem: {err}"); + self.metrics.event_fails.inc(); + break; + } + } + self.queues[0].advance_used_ring_idx(); + + if self.queues[0].prepare_kick() { + active_state + .interrupt + .trigger(VirtioInterruptType::Queue(0)) + .unwrap_or_else(|err| { + error!("pmem: {err}"); + self.metrics.event_fails.inc(); + }); + } + Ok(()) + } + + fn process_chain(&self, head: DescriptorChain) -> Result<(), PmemError> { + // This is safe since we checked in the event handler that the device is activated. + let active_state = self.device_state.active_state().unwrap(); + + if head.is_write_only() { + return Err(PmemError::WriteOnlyDescriptor); + } + let request: u32 = active_state.mem.read_obj(head.addr)?; + if request != VIRTIO_PMEM_REQ_TYPE_FLUSH { + return Err(PmemError::UnknownRequestType(request)); + } + let Some(status_descriptor) = head.next_descriptor() else { + return Err(PmemError::DescriptorChainTooShort); + }; + if !status_descriptor.is_write_only() { + return Err(PmemError::ReadOnlyDescriptor); + } + let mut result = SUCCESS; + // SAFETY: We are calling the system call with valid arguments and checking the returned + // value + unsafe { + let ret = libc::msync( + self.mmap_ptr as *mut libc::c_void, + u64_to_usize(self.file_len), + libc::MS_SYNC, + ); + if ret < 0 { + error!("pmem: Unable to msync the file. 
Error: {}", ret); + result = FAILURE; + } + } + active_state.mem.write_obj(result, status_descriptor.addr)?; + Ok(()) + } + + pub fn process_queue(&mut self) { + self.metrics.queue_event_count.inc(); + if let Err(err) = self.queue_events[0].read() { + error!("pmem: Failed to get queue event: {err:?}"); + self.metrics.event_fails.inc(); + return; + } + + self.handle_queue().unwrap_or_else(|err| { + error!("pmem: {err:?}"); + self.metrics.event_fails.inc(); + }); + } +} + +impl VirtioDevice for Pmem { + impl_device_type!(VIRTIO_ID_PMEM); + + fn avail_features(&self) -> u64 { + self.avail_features + } + + fn acked_features(&self) -> u64 { + self.acked_features + } + + fn set_acked_features(&mut self, acked_features: u64) { + self.acked_features = acked_features; + } + + fn queues(&self) -> &[Queue] { + &self.queues + } + + fn queues_mut(&mut self) -> &mut [Queue] { + &mut self.queues + } + + fn queue_events(&self) -> &[EventFd] { + &self.queue_events + } + + fn interrupt_trigger(&self) -> &dyn VirtioInterrupt { + self.device_state + .active_state() + .expect("Device not activated") + .interrupt + .deref() + } + + fn read_config(&self, offset: u64, data: &mut [u8]) { + if let Some(config_space_bytes) = self.config_space.as_slice().get(u64_to_usize(offset)..) 
{ + let len = config_space_bytes.len().min(data.len()); + data[..len].copy_from_slice(&config_space_bytes[..len]); + } else { + error!("Failed to read config space"); + self.metrics.cfg_fails.inc(); + } + } + + fn write_config(&mut self, _offset: u64, _data: &[u8]) {} + + fn activate( + &mut self, + mem: GuestMemoryMmap, + interrupt: Arc, + ) -> Result<(), ActivateError> { + for q in self.queues.iter_mut() { + q.initialize(&mem) + .map_err(ActivateError::QueueMemoryError)?; + } + + if self.activate_event.write(1).is_err() { + self.metrics.activate_fails.inc(); + return Err(ActivateError::EventFd); + } + self.device_state = DeviceState::Activated(ActiveState { mem, interrupt }); + Ok(()) + } + + fn is_activated(&self) -> bool { + self.device_state.is_activated() + } +} + +#[cfg(test)] +mod tests { + use vmm_sys_util::tempfile::TempFile; + + use super::*; + use crate::devices::virtio::queue::{VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE}; + use crate::devices::virtio::test_utils::{VirtQueue, default_interrupt, default_mem}; + + #[test] + fn test_from_config() { + let config = PmemConfig { + id: "1".into(), + path_on_host: "not_a_path".into(), + root_device: true, + read_only: false, + }; + assert!(matches!( + Pmem::new(config).unwrap_err(), + PmemError::BackingFile(_), + )); + + let dummy_file = TempFile::new().unwrap(); + let dummy_path = dummy_file.as_path().to_str().unwrap().to_string(); + let config = PmemConfig { + id: "1".into(), + path_on_host: dummy_path.clone(), + root_device: true, + read_only: false, + }; + assert!(matches!( + Pmem::new(config).unwrap_err(), + PmemError::BackingFileZeroSize, + )); + + dummy_file.as_file().set_len(0x20_0000); + let config = PmemConfig { + id: "1".into(), + path_on_host: dummy_path, + root_device: true, + read_only: false, + }; + Pmem::new(config).unwrap(); + } + + #[test] + fn test_process_chain() { + let dummy_file = TempFile::new().unwrap(); + dummy_file.as_file().set_len(0x20_0000); + let dummy_path = 
dummy_file.as_path().to_str().unwrap().to_string(); + let config = PmemConfig { + id: "1".into(), + path_on_host: dummy_path, + root_device: true, + read_only: false, + }; + let mut pmem = Pmem::new(config).unwrap(); + + let mem = default_mem(); + let interrupt = default_interrupt(); + let vq = VirtQueue::new(GuestAddress(0), &mem, 16); + pmem.queues[0] = vq.create_queue(); + pmem.activate(mem.clone(), interrupt).unwrap(); + + // Valid request + { + vq.avail.ring[0].set(0); + vq.dtable[0].set(0x1000, 4, VIRTQ_DESC_F_NEXT, 1); + vq.avail.ring[1].set(1); + vq.dtable[1].set(0x2000, 4, VIRTQ_DESC_F_WRITE, 0); + mem.write_obj::(0, GuestAddress(0x1000)).unwrap(); + mem.write_obj::(0x69, GuestAddress(0x2000)).unwrap(); + + vq.used.idx.set(0); + vq.avail.idx.set(1); + let head = pmem.queues[0].pop().unwrap().unwrap(); + pmem.process_chain(head).unwrap(); + assert_eq!(mem.read_obj::(GuestAddress(0x2000)).unwrap(), 0); + } + + // Invalid request type + { + vq.avail.ring[0].set(0); + vq.dtable[0].set(0x1000, 4, VIRTQ_DESC_F_NEXT, 1); + mem.write_obj::(0x69, GuestAddress(0x1000)).unwrap(); + + pmem.queues[0] = vq.create_queue(); + vq.used.idx.set(0); + vq.avail.idx.set(1); + let head = pmem.queues[0].pop().unwrap().unwrap(); + assert!(matches!( + pmem.process_chain(head).unwrap_err(), + PmemError::UnknownRequestType(0x69), + )); + } + + // Short chain request + { + vq.avail.ring[0].set(0); + vq.dtable[0].set(0x1000, 4, 0, 1); + mem.write_obj::(0, GuestAddress(0x1000)).unwrap(); + + pmem.queues[0] = vq.create_queue(); + vq.used.idx.set(0); + vq.avail.idx.set(1); + let head = pmem.queues[0].pop().unwrap().unwrap(); + assert!(matches!( + pmem.process_chain(head).unwrap_err(), + PmemError::DescriptorChainTooShort, + )); + } + + // Write only first descriptor + { + vq.avail.ring[0].set(0); + vq.dtable[0].set(0x1000, 4, VIRTQ_DESC_F_WRITE | VIRTQ_DESC_F_NEXT, 1); + vq.avail.ring[1].set(1); + vq.dtable[1].set(0x2000, 4, VIRTQ_DESC_F_WRITE, 0); + mem.write_obj::(0, 
GuestAddress(0x1000)).unwrap(); + + pmem.queues[0] = vq.create_queue(); + vq.used.idx.set(0); + vq.avail.idx.set(1); + let head = pmem.queues[0].pop().unwrap().unwrap(); + assert!(matches!( + pmem.process_chain(head).unwrap_err(), + PmemError::WriteOnlyDescriptor, + )); + } + + // Read only second descriptor + { + vq.avail.ring[0].set(0); + vq.dtable[0].set(0x1000, 4, VIRTQ_DESC_F_NEXT, 1); + vq.avail.ring[1].set(1); + vq.dtable[1].set(0x2000, 4, 0, 0); + mem.write_obj::(0, GuestAddress(0x1000)).unwrap(); + + pmem.queues[0] = vq.create_queue(); + vq.used.idx.set(0); + vq.avail.idx.set(1); + let head = pmem.queues[0].pop().unwrap().unwrap(); + assert!(matches!( + pmem.process_chain(head).unwrap_err(), + PmemError::ReadOnlyDescriptor, + )); + } + } +} diff --git a/src/vmm/src/devices/virtio/pmem/event_handler.rs b/src/vmm/src/devices/virtio/pmem/event_handler.rs new file mode 100644 index 00000000000..36af5c640c8 --- /dev/null +++ b/src/vmm/src/devices/virtio/pmem/event_handler.rs @@ -0,0 +1,84 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +use event_manager::{EventOps, EventSet, Events, MutEventSubscriber}; +use log::{error, warn}; + +use super::device::Pmem; +use crate::devices::virtio::device::VirtioDevice; + +impl Pmem { + const PROCESS_ACTIVATE: u32 = 0; + const PROCESS_PMEM_QUEUE: u32 = 1; + + fn register_runtime_events(&self, ops: &mut EventOps) { + if let Err(err) = ops.add(Events::with_data( + &self.queue_events[0], + Self::PROCESS_PMEM_QUEUE, + EventSet::IN, + )) { + error!("pmem: Failed to register queue event: {err}"); + } + } + + fn register_activate_event(&self, ops: &mut EventOps) { + if let Err(err) = ops.add(Events::with_data( + &self.activate_event, + Self::PROCESS_ACTIVATE, + EventSet::IN, + )) { + error!("pmem: Failed to register activate event: {err}"); + } + } + + fn process_activate_event(&self, ops: &mut EventOps) { + if let Err(err) = self.activate_event.read() { + error!("pmem: Failed to consume activate event: {err}"); + } + + // Register runtime events + self.register_runtime_events(ops); + + // Remove activate event + if let Err(err) = ops.remove(Events::with_data( + &self.activate_event, + Self::PROCESS_ACTIVATE, + EventSet::IN, + )) { + error!("pmem: Failed to unregister activate event: {err}"); + } + } +} + +impl MutEventSubscriber for Pmem { + fn init(&mut self, ops: &mut EventOps) { + if self.is_activated() { + self.register_runtime_events(ops) + } else { + self.register_activate_event(ops) + } + } + + fn process(&mut self, events: Events, ops: &mut EventOps) { + let event_set = events.event_set(); + let source = events.data(); + + if !event_set.contains(EventSet::IN) { + warn!("pmem: Received unknown event: {event_set:#?} from source {source}"); + return; + } + + if !self.is_activated() { + warn!("pmem: The device is not activated yet. 
Spurious event received from {source}"); + return; + } + + match source { + Self::PROCESS_ACTIVATE => self.process_activate_event(ops), + Self::PROCESS_PMEM_QUEUE => self.process_queue(), + _ => { + warn!("pmem: Unknown event received: {source}"); + } + } + } +} diff --git a/src/vmm/src/devices/virtio/pmem/metrics.rs b/src/vmm/src/devices/virtio/pmem/metrics.rs new file mode 100644 index 00000000000..02348b4ca51 --- /dev/null +++ b/src/vmm/src/devices/virtio/pmem/metrics.rs @@ -0,0 +1,244 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Defines the metrics system for pmem devices. +//! +//! # Metrics format +//! The metrics are flushed in JSON when requested by vmm::logger::metrics::METRICS.write(). +//! +//! ## JSON example with metrics: +//! ```json +//! { +//! "pmem_drv0": { +//! "activate_fails": "SharedIncMetric", +//! "cfg_fails": "SharedIncMetric", +//! "no_avail_buffer": "SharedIncMetric", +//! "event_fails": "SharedIncMetric", +//! "execute_fails": "SharedIncMetric", +//! ... +//! } +//! "pmem_drv1": { +//! "activate_fails": "SharedIncMetric", +//! "cfg_fails": "SharedIncMetric", +//! "no_avail_buffer": "SharedIncMetric", +//! "event_fails": "SharedIncMetric", +//! "execute_fails": "SharedIncMetric", +//! ... +//! } +//! ... +//! "pmem_drive_id": { +//! "activate_fails": "SharedIncMetric", +//! "cfg_fails": "SharedIncMetric", +//! "no_avail_buffer": "SharedIncMetric", +//! "event_fails": "SharedIncMetric", +//! "execute_fails": "SharedIncMetric", +//! ... +//! } +//! "pmem": { +//! "activate_fails": "SharedIncMetric", +//! "cfg_fails": "SharedIncMetric", +//! "no_avail_buffer": "SharedIncMetric", +//! "event_fails": "SharedIncMetric", +//! "execute_fails": "SharedIncMetric", +//! ... +//! } +//! } +//! ``` +//! Each `pmem` field in the example above is a serializable `PmemDeviceMetrics` structure +//! collecting metrics such as `activate_fails`, `cfg_fails`, etc. 
for the pmem device. +//! `pmem_drv0` represent metrics for the endpoint "/pmem/drv0", +//! `pmem_drv1` represent metrics for the endpoint "/pmem/drv1", and +//! `pmem_drive_id` represent metrics for the endpoint "/pmem/{drive_id}" +//! pmem device respectively and `pmem` is the aggregate of all the per device metrics. +//! +//! # Limitations +//! pmem device currently do not have `vmm::logger::metrics::StoreMetrics` so aggregate +//! doesn't consider them. +//! +//! # Design +//! The main design goals of this system are: +//! * To improve pmem device metrics by logging them at per device granularity. +//! * Continue to provide aggregate pmem metrics to maintain backward compatibility. +//! * Move PmemDeviceMetrics out of from logger and decouple it. +//! * Rely on `serde` to provide the actual serialization for writing the metrics. +//! * Since all metrics start at 0, we implement the `Default` trait via derive for all of them, to +//! avoid having to initialize everything by hand. +//! +//! * Devices could be created in any order i.e. the first device created could either be drv0 or +//! drv1 so if we use a vector for PmemDeviceMetrics and call 1st device as pmem0, then pmem0 +//! could sometimes point to drv0 and sometimes to drv1 which doesn't help with analysing the +//! metrics. So, use Map instead of Vec to help understand which drive the metrics actually +//! belongs to. +//! +//! The system implements 1 type of metrics: +//! * Shared Incremental Metrics (SharedIncMetrics) - dedicated for the metrics which need a counter +//! (i.e the number of times an API request failed). These metrics are reset upon flush. +//! +//! We add PmemDeviceMetrics entries from pmem::metrics::METRICS into Pmem device instead of +//! Pmem device having individual separate PmemDeviceMetrics entries because Pmem device is not +//! accessible from signal handlers to flush metrics and pmem::metrics::METRICS is. 
+ +use std::collections::BTreeMap; +use std::sync::{Arc, RwLock}; + +use serde::ser::SerializeMap; +use serde::{Serialize, Serializer}; + +use crate::logger::{IncMetric, LatencyAggregateMetrics, SharedIncMetric}; + +/// map of pmem drive id and metrics +/// this should be protected by a lock before accessing. +#[derive(Debug)] +pub struct PmemMetricsPerDevice { + /// used to access per pmem device metrics + pub metrics: BTreeMap>, +} + +impl PmemMetricsPerDevice { + /// Allocate `PmemMetrics` for pmem device having + /// id `drive_id`. Also, allocate only if it doesn't + /// exist to avoid overwriting previously allocated data. + /// lock is always initialized so it is safe to unwrap + /// the lock without a check. + pub fn alloc(drive_id: String) -> Arc { + Arc::clone( + METRICS + .write() + .unwrap() + .metrics + .entry(drive_id) + .or_insert_with(|| Arc::new(PmemMetrics::default())), + ) + } +} + +/// Pool of pmem-related metrics per device behind a lock to +/// keep things thread safe. Since the lock is initialized here +/// it is safe to unwrap it without any check. +static METRICS: RwLock = RwLock::new(PmemMetricsPerDevice { + metrics: BTreeMap::new(), +}); + +/// This function facilitates aggregation and serialization of +/// per pmem device metrics. +pub fn flush_metrics(serializer: S) -> Result { + let pmem_metrics = METRICS.read().unwrap(); + let metrics_len = pmem_metrics.metrics.len(); + // +1 to accommodate aggregate pmem metrics + let mut seq = serializer.serialize_map(Some(1 + metrics_len))?; + + let mut pmem_aggregated: PmemMetrics = PmemMetrics::default(); + + for (name, metrics) in pmem_metrics.metrics.iter() { + let devn = format!("pmem_{}", name); + // serialization will flush the metrics so aggregate before it. + let m: &PmemMetrics = metrics; + pmem_aggregated.aggregate(m); + seq.serialize_entry(&devn, m)?; + } + seq.serialize_entry("pmem", &pmem_aggregated)?; + seq.end() +} + +/// Pmem Device associated metrics.
+#[derive(Debug, Default, Serialize)] +pub struct PmemMetrics { + /// Number of times when activate failed on a pmem device. + pub activate_fails: SharedIncMetric, + /// Number of times when interacting with the space config of a pmem device failed. + pub cfg_fails: SharedIncMetric, + /// Number of times when handling events on a pmem device failed. + pub event_fails: SharedIncMetric, + /// Number of events triggered on the queue of this pmem device. + pub queue_event_count: SharedIncMetric, +} + +impl PmemMetrics { + /// Const default construction. + pub fn new() -> Self { + Self { + ..Default::default() + } + } + + /// pmem metrics are SharedIncMetric where the diff of current vs + /// old is serialized i.e. serialize_u64(current-old). + /// So to have the aggregate serialized in same way we need to + /// fetch the diff of current vs old metrics and add it to the + /// aggregate. + pub fn aggregate(&mut self, other: &Self) { + self.activate_fails.add(other.activate_fails.fetch_diff()); + self.cfg_fails.add(other.cfg_fails.fetch_diff()); + self.event_fails.add(other.event_fails.fetch_diff()); + self.queue_event_count + .add(other.queue_event_count.fetch_diff()); + } +} + +#[cfg(test)] +pub mod tests { + use super::*; + + #[test] + fn test_max_pmem_dev_metrics() { + // Note: this test has nothing to do with + // pmem structure or IRQs, this is just to allocate + // metrics for max number of devices that system can have. + // We have 5-23 IRQ for pmem devices on x86_64 so, there + // are 19 pmem devices at max. And, even though we have more + // devices on aarch64 but we stick to 19 to keep test common. + const MAX_PMEM_DEVICES: usize = 19; + + // This is to make sure that RwLock for pmem::metrics::METRICS is good. + drop(METRICS.read().unwrap()); + drop(METRICS.write().unwrap()); + + // pmem::metrics::METRICS is in short RwLock on Vec of PmemDeviceMetrics. 
+ // Normally, pointer to unique entries of pmem::metrics::METRICS are stored + // in Pmem device so that Pmem device can do self.metrics.* to + // update a metric. We try to do something similar here without + // using Pmem device by allocating max number of + // PmemDeviceMetrics in pmem::metrics::METRICS and store pointer to + // each entry in the local `metrics` vec. + // We then update 1 IncMetric and 2 SharedMetric for each metrics + // and validate if the metrics for per device was updated as + // expected. + let mut metrics: Vec> = Vec::new(); + for i in 0..MAX_PMEM_DEVICES { + let pmem_name: String = format!("pmem{}", i); + metrics.push(PmemMetricsPerDevice::alloc(pmem_name.clone())); + // update IncMetric + metrics[i].activate_fails.inc(); + + if i == 0 { + // Unit tests run in parallel and we have + // `test_single_pmem_dev_metrics` that also increases + // the IncMetric count of drv0 by 1 (intentional to check + // thread safety) so we check if the count is >=1. + assert!(metrics[i].activate_fails.count() >= 1); + } else { + assert!(metrics[i].activate_fails.count() == 1); + } + } + } + + #[test] + fn test_single_pmem_dev_metrics() { + let test_metrics = PmemMetricsPerDevice::alloc(String::from("pmem0")); + // Test to update IncMetrics + test_metrics.activate_fails.inc(); + assert!( + test_metrics.activate_fails.count() > 0, + "{}", + test_metrics.activate_fails.count() + ); + + // We expect only 2 tests (this and test_max_pmem_dev_metrics) + // to update activate_fails count for pmem0. + assert!( + test_metrics.activate_fails.count() <= 2, + "{}", + test_metrics.activate_fails.count() + ); + } +} diff --git a/src/vmm/src/devices/virtio/pmem/mod.rs b/src/vmm/src/devices/virtio/pmem/mod.rs new file mode 100644 index 00000000000..bd50a1993fe --- /dev/null +++ b/src/vmm/src/devices/virtio/pmem/mod.rs @@ -0,0 +1,10 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +pub mod device; +pub mod event_handler; +pub mod metrics; +pub mod persist; + +pub const PMEM_NUM_QUEUES: usize = 1; +pub const PMEM_QUEUE_SIZE: u16 = 256; diff --git a/src/vmm/src/devices/virtio/pmem/persist.rs b/src/vmm/src/devices/virtio/pmem/persist.rs new file mode 100644 index 00000000000..a8089433b86 --- /dev/null +++ b/src/vmm/src/devices/virtio/pmem/persist.rs @@ -0,0 +1,131 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use serde::{Deserialize, Serialize}; +use vm_memory::GuestAddress; + +use super::device::{ConfigSpace, Pmem, PmemError}; +use crate::Vm; +use crate::devices::virtio::device::DeviceState; +use crate::devices::virtio::generated::virtio_ids::VIRTIO_ID_PMEM; +use crate::devices::virtio::persist::{PersistError as VirtioStateError, VirtioDeviceState}; +use crate::devices::virtio::pmem::{PMEM_NUM_QUEUES, PMEM_QUEUE_SIZE}; +use crate::snapshot::Persist; +use crate::vmm_config::pmem::PmemConfig; +use crate::vstate::memory::{GuestMemoryMmap, GuestRegionMmap}; +use crate::vstate::vm::VmError; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PmemState { + pub virtio_state: VirtioDeviceState, + pub config_space: ConfigSpace, + pub config: PmemConfig, +} + +#[derive(Debug)] +pub struct PmemConstructorArgs<'a> { + pub mem: &'a GuestMemoryMmap, + pub vm: &'a Vm, +} + +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum PmemPersistError { + /// Error resetting VirtIO state: {0} + VirtioState(#[from] VirtioStateError), + /// Error creating Pmem devie: {0} + Pmem(#[from] PmemError), + /// Error registering memory region: {0} + Vm(#[from] VmError), +} + +impl<'a> Persist<'a> for Pmem { + type State = PmemState; + type ConstructorArgs = PmemConstructorArgs<'a>; + type Error = PmemPersistError; + + fn save(&self) -> Self::State { + PmemState { + virtio_state: VirtioDeviceState::from_device(self), + config_space: 
self.config_space, + config: self.config.clone(), + } + } + + fn restore( + constructor_args: Self::ConstructorArgs, + state: &Self::State, + ) -> Result { + let queues = state.virtio_state.build_queues_checked( + constructor_args.mem, + VIRTIO_ID_PMEM, + PMEM_NUM_QUEUES, + PMEM_QUEUE_SIZE, + )?; + + let mut pmem = Pmem::new_with_queues(state.config.clone(), queues)?; + pmem.config_space = state.config_space; + pmem.avail_features = state.virtio_state.avail_features; + pmem.acked_features = state.virtio_state.acked_features; + + pmem.set_mem_region(constructor_args.vm)?; + + Ok(pmem) + } +} + +#[cfg(test)] +mod tests { + use vmm_sys_util::tempfile::TempFile; + + use super::*; + use crate::arch::Kvm; + use crate::devices::virtio::device::VirtioDevice; + use crate::devices::virtio::test_utils::default_mem; + use crate::snapshot::Snapshot; + + #[test] + fn test_persistence() { + // We create the backing file here so that it exists for the whole lifetime of the test. + let dummy_file = TempFile::new().unwrap(); + dummy_file.as_file().set_len(0x20_0000); + let dummy_path = dummy_file.as_path().to_str().unwrap().to_string(); + let config = PmemConfig { + id: "1".into(), + path_on_host: dummy_path, + root_device: true, + read_only: false, + }; + let pmem = Pmem::new(config).unwrap(); + let guest_mem = default_mem(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = Vm::new(&kvm).unwrap(); + + // Save the pmem device. + let mut mem = vec![0; 4096]; + + Snapshot::new(pmem.save()) + .save(&mut mem.as_mut_slice()) + .unwrap(); + + // Restore the pmem device. + let restored_pmem = Pmem::restore( + PmemConstructorArgs { + mem: &guest_mem, + vm: &vm, + }, + &Snapshot::load_without_crc_check(mem.as_slice()) + .unwrap() + .data, + ) + .unwrap(); + + // Test that virtio specific fields are the same.
+ assert_eq!(restored_pmem.device_type(), VIRTIO_ID_PMEM); + assert_eq!(restored_pmem.avail_features(), pmem.avail_features()); + assert_eq!(restored_pmem.acked_features(), pmem.acked_features()); + assert_eq!(restored_pmem.queues(), pmem.queues()); + assert!(!pmem.is_activated()); + assert!(!restored_pmem.is_activated()); + assert_eq!(restored_pmem.config, pmem.config); + } +} diff --git a/src/vmm/src/logger/metrics.rs b/src/vmm/src/logger/metrics.rs index 4b80cf2f2f5..527ba911461 100644 --- a/src/vmm/src/logger/metrics.rs +++ b/src/vmm/src/logger/metrics.rs @@ -75,6 +75,7 @@ use crate::devices::legacy; use crate::devices::virtio::balloon::metrics as balloon_metrics; use crate::devices::virtio::block::virtio::metrics as block_metrics; use crate::devices::virtio::net::metrics as net_metrics; +use crate::devices::virtio::pmem::metrics as pmem_metrics; use crate::devices::virtio::rng::metrics as entropy_metrics; use crate::devices::virtio::vhost_user_metrics; use crate::devices::virtio::vsock::metrics as vsock_metrics; @@ -415,6 +416,10 @@ pub struct PutRequestsMetrics { pub vsock_count: SharedIncMetric, /// Number of failures in creating a vsock device. pub vsock_fails: SharedIncMetric, + /// Number of PUTs triggering a pmem attach. + pub pmem_count: SharedIncMetric, + /// Number of failures in attaching a pmem device. 
+ pub pmem_fails: SharedIncMetric, /// Number of PUTs to /serial pub serial_count: SharedIncMetric, /// Number of failed PUTs to /serial @@ -444,6 +449,8 @@ impl PutRequestsMetrics { mmds_fails: SharedIncMetric::new(), vsock_count: SharedIncMetric::new(), vsock_fails: SharedIncMetric::new(), + pmem_count: SharedIncMetric::new(), + pmem_fails: SharedIncMetric::new(), serial_count: SharedIncMetric::new(), serial_fails: SharedIncMetric::new(), } @@ -867,6 +874,7 @@ create_serialize_proxy!(VhostUserMetricsSerializeProxy, vhost_user_metrics); create_serialize_proxy!(BalloonMetricsSerializeProxy, balloon_metrics); create_serialize_proxy!(EntropyMetricsSerializeProxy, entropy_metrics); create_serialize_proxy!(VsockMetricsSerializeProxy, vsock_metrics); +create_serialize_proxy!(PmemMetricsSerializeProxy, pmem_metrics); create_serialize_proxy!(LegacyDevMetricsSerializeProxy, legacy); /// Structure storing all metrics while enforcing serialization support on them. @@ -916,6 +924,9 @@ pub struct FirecrackerMetrics { /// Metrics related to virtio-rng entropy device. pub entropy_ser: EntropyMetricsSerializeProxy, #[serde(flatten)] + /// Metrics related to the virtio-pmem device. + pub pmem_ser: PmemMetricsSerializeProxy, + #[serde(flatten)] /// Vhost-user device related metrics.
pub vhost_user_ser: VhostUserMetricsSerializeProxy, /// Interrupt related metrics @@ -944,6 +955,7 @@ impl FirecrackerMetrics { signals: SignalMetrics::new(), vsock_ser: VsockMetricsSerializeProxy {}, entropy_ser: EntropyMetricsSerializeProxy {}, + pmem_ser: PmemMetricsSerializeProxy {}, vhost_user_ser: VhostUserMetricsSerializeProxy {}, interrupts: InterruptMetrics::new(), } diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index 0d2f4bbed22..c9f40728763 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -28,6 +28,7 @@ use crate::vmm_config::machine_config::{ use crate::vmm_config::metrics::{MetricsConfig, MetricsConfigError, init_metrics}; use crate::vmm_config::mmds::{MmdsConfig, MmdsConfigError}; use crate::vmm_config::net::*; +use crate::vmm_config::pmem::{PmemBuilder, PmemConfig, PmemConfigError}; use crate::vmm_config::serial::SerialConfig; use crate::vmm_config::vsock::*; use crate::vstate::memory; @@ -62,6 +63,8 @@ pub enum ResourcesError { VsockDevice(#[from] VsockConfigError), /// Entropy device error: {0} EntropyDevice(#[from] EntropyDeviceError), + /// Pmem device error: {0} + PmemDevice(#[from] PmemConfigError), } #[derive(Serialize, Deserialize, PartialEq, Eq, Debug)] @@ -87,6 +90,8 @@ pub struct VmmConfig { network_interfaces: Vec, vsock: Option, entropy: Option, + #[serde(default, rename = "pmem")] + pmem_devices: Vec, #[serde(skip)] serial_config: Option, } @@ -109,6 +114,8 @@ pub struct VmResources { pub net_builder: NetBuilder, /// The entropy device builder. pub entropy: EntropyDeviceBuilder, + /// The pmem devices. + pub pmem: PmemBuilder, /// The optional Mmds data store. // This is initialised on demand (if ever used), so that we don't allocate it unless it's // actually used. 
@@ -198,6 +205,10 @@ impl VmResources { resources.build_entropy_device(entropy_device_config)?; } + for pmem_config in vmm_config.pmem_devices.into_iter() { + resources.build_pmem_device(pmem_config)?; + } + if let Some(serial_cfg) = vmm_config.serial_config { resources.serial_out_path = serial_cfg.serial_out_path; } @@ -228,11 +239,9 @@ impl VmResources { SharedDeviceType::VirtioBlock(block) => { self.block.add_virtio_device(block); } - SharedDeviceType::Network(network) => { self.net_builder.add_device(network); } - SharedDeviceType::Balloon(balloon) => { self.balloon.set_device(balloon); @@ -240,13 +249,15 @@ impl VmResources { return Err(ResourcesError::BalloonDevice(BalloonConfigError::HugePages)); } } - SharedDeviceType::Vsock(vsock) => { self.vsock.set_device(vsock); } SharedDeviceType::Entropy(entropy) => { self.entropy.set_device(entropy); } + SharedDeviceType::Pmem(pmem) => { + self.pmem.add_device(pmem); + } } Ok(()) @@ -364,7 +375,8 @@ impl VmResources { &mut self, block_device_config: BlockDeviceConfig, ) -> Result<(), DriveError> { - self.block.insert(block_device_config) + let has_pmem_root = self.pmem.has_root_device(); + self.block.insert(block_device_config, has_pmem_root) } /// Builds a network device to be attached when the VM starts. @@ -389,6 +401,12 @@ impl VmResources { self.entropy.insert(body) } + /// Builds a pmem device to be attached when the VM starts. + pub fn build_pmem_device(&mut self, body: PmemConfig) -> Result<(), PmemConfigError> { + let has_block_root = self.block.has_root_device(); + self.pmem.build(body, has_block_root) + } + /// Setter for mmds config. pub fn set_mmds_config( &mut self, @@ -515,6 +533,7 @@ impl From<&VmResources> for VmmConfig { network_interfaces: resources.net_builder.configs(), vsock: resources.vsock.config(), entropy: resources.entropy.config(), + pmem_devices: resources.pmem.configs(), // serial_config is marked serde(skip) so that it doesnt end up in snapshots. 
serial_config: None, } @@ -597,7 +616,7 @@ mod tests { fn default_blocks() -> BlockBuilder { let mut blocks = BlockBuilder::new(); let (cfg, _file) = default_block_cfg(); - blocks.insert(cfg).unwrap(); + blocks.insert(cfg, false).unwrap(); blocks } @@ -627,6 +646,7 @@ mod tests { boot_timer: false, mmds_size_limit: HTTP_MAX_PAYLOAD_SIZE, entropy: Default::default(), + pmem: Default::default(), pci_enabled: false, serial_out_path: None, } diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index e0701e740ef..6bae98f3546 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -33,6 +33,7 @@ use crate::vmm_config::mmds::{MmdsConfig, MmdsConfigError}; use crate::vmm_config::net::{ NetworkInterfaceConfig, NetworkInterfaceError, NetworkInterfaceUpdateConfig, }; +use crate::vmm_config::pmem::{PmemConfig, PmemConfigError}; use crate::vmm_config::serial::SerialConfig; use crate::vmm_config::snapshot::{CreateSnapshotParams, LoadSnapshotParams, SnapshotType}; use crate::vmm_config::vsock::{VsockConfigError, VsockDeviceConfig}; @@ -75,6 +76,8 @@ pub enum VmmAction { /// Add a new block device or update one that already exists using the `BlockDeviceConfig` as /// input. This action can only be called before the microVM has booted. InsertBlockDevice(BlockDeviceConfig), + /// Add a virtio-pmem device. + InsertPmemDevice(PmemConfig), /// Add a new network interface config or update one that already exists using the /// `NetworkInterfaceConfig` as input. This action can only be called before the microVM has /// booted. 
@@ -143,6 +146,8 @@ pub enum VmmActionError { DriveConfig(#[from] DriveError), /// Entropy device error: {0} EntropyDevice(#[from] EntropyDeviceError), + /// Pmem device error: {0} + PmemDevice(#[from] PmemConfigError), /// Internal VMM error: {0} InternalVmm(#[from] VmmError), /// Load snapshot error: {0} @@ -432,6 +437,7 @@ impl<'a> PrebootApiController<'a> { GetVmInstanceInfo => Ok(VmmData::InstanceInformation(self.instance_info.clone())), GetVmmVersion => Ok(VmmData::VmmVersion(self.instance_info.vmm_version.clone())), InsertBlockDevice(config) => self.insert_block_device(config), + InsertPmemDevice(config) => self.insert_pmem_device(config), InsertNetworkDevice(config) => self.insert_net_device(config), LoadSnapshot(config) => self .load_snapshot(&config) @@ -489,6 +495,14 @@ impl<'a> PrebootApiController<'a> { .map_err(VmmActionError::NetworkConfig) } + fn insert_pmem_device(&mut self, cfg: PmemConfig) -> Result { + self.boot_path = true; + self.vm_resources + .build_pmem_device(cfg) + .map(|()| VmmData::Empty) + .map_err(VmmActionError::PmemDevice) + } + fn set_balloon_device(&mut self, cfg: BalloonDeviceConfig) -> Result { self.boot_path = true; self.vm_resources @@ -687,6 +701,7 @@ impl RuntimeApiController { | ConfigureMetrics(_) | ConfigureSerial(_) | InsertBlockDevice(_) + | InsertPmemDevice(_) | InsertNetworkDevice(_) | LoadSnapshot(_) | PutCpuConfiguration(_) @@ -1272,5 +1287,11 @@ mod tests { check_unsupported(runtime_request(VmmAction::SetEntropyDevice( EntropyDeviceConfig::default(), ))); + check_unsupported(runtime_request(VmmAction::InsertPmemDevice(PmemConfig { + id: String::new(), + path_on_host: String::new(), + root_device: false, + read_only: false, + }))); } } diff --git a/src/vmm/src/vmm_config/drive.rs b/src/vmm/src/vmm_config/drive.rs index 9e301eff751..adf8083f74d 100644 --- a/src/vmm/src/vmm_config/drive.rs +++ b/src/vmm/src/vmm_config/drive.rs @@ -16,6 +16,8 @@ use crate::devices::virtio::block::{BlockError, CacheType}; /// Errors 
associated with the operations allowed on a drive. #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum DriveError { + /// Attempt to add block as a root device while the root device defined as a pmem device + AddingSecondRootDevice, /// Unable to create the virtio block device: {0} CreateBlockDevice(BlockError), /// Cannot create RateLimiter: {0} @@ -99,7 +101,7 @@ impl BlockBuilder { } /// Specifies whether there is a root block device already present in the list. - fn has_root_device(&self) -> bool { + pub fn has_root_device(&self) -> bool { // If there is a root device, it would be at the top of the list. if let Some(block) = self.devices.front() { block.lock().expect("Poisoned lock").root_device() @@ -127,11 +129,19 @@ impl BlockBuilder { /// Inserts a `Block` in the block devices list using the specified configuration. /// If a block with the same id already exists, it will overwrite it. /// Inserting a secondary root block device will fail. - pub fn insert(&mut self, config: BlockDeviceConfig) -> Result<(), DriveError> { + pub fn insert( + &mut self, + config: BlockDeviceConfig, + has_pmem_root: bool, + ) -> Result<(), DriveError> { let position = self.get_index_of_drive_id(&config.drive_id); let has_root_device = self.has_root_device(); let configured_as_root = config.is_root_device; + if configured_as_root && has_pmem_root { + return Err(DriveError::AddingSecondRootDevice); + } + // Don't allow adding a second root block device. // If the new device cfg is root and not an update to the existing root, fail fast. 
if configured_as_root && has_root_device && position != Some(0) { @@ -234,7 +244,9 @@ mod tests { }; let mut block_devs = BlockBuilder::new(); - block_devs.insert(dummy_block_device.clone()).unwrap(); + block_devs + .insert(dummy_block_device.clone(), false) + .unwrap(); assert!(!block_devs.has_root_device()); assert_eq!(block_devs.devices.len(), 1); @@ -266,7 +278,9 @@ mod tests { }; let mut block_devs = BlockBuilder::new(); - block_devs.insert(dummy_block_device.clone()).unwrap(); + block_devs + .insert(dummy_block_device.clone(), false) + .unwrap(); assert!(block_devs.has_root_device()); assert_eq!(block_devs.devices.len(), 1); @@ -276,6 +290,36 @@ mod tests { assert_eq!(block.read_only(), dummy_block_device.is_read_only.unwrap()); } + #[test] + fn test_add_one_root_block_device_with_pmem_already_as_root() { + let dummy_file = TempFile::new().unwrap(); + let dummy_path = dummy_file.as_path().to_str().unwrap().to_string(); + + let dummy_block_device = BlockDeviceConfig { + drive_id: String::from("1"), + partuuid: None, + is_root_device: true, + cache_type: CacheType::Unsafe, + + is_read_only: Some(true), + path_on_host: Some(dummy_path), + rate_limiter: None, + file_engine_type: None, + + socket: None, + }; + + let mut block_devs = BlockBuilder::new(); + assert!(matches!( + block_devs + .insert(dummy_block_device.clone(), true) + .unwrap_err(), + DriveError::AddingSecondRootDevice, + )); + assert!(!block_devs.has_root_device()); + assert_eq!(block_devs.devices.len(), 0); + } + #[test] fn test_add_two_root_block_devs() { let dummy_file_1 = TempFile::new().unwrap(); @@ -311,9 +355,9 @@ mod tests { }; let mut block_devs = BlockBuilder::new(); - block_devs.insert(root_block_device_1).unwrap(); + block_devs.insert(root_block_device_1, false).unwrap(); assert_eq!( - block_devs.insert(root_block_device_2).unwrap_err(), + block_devs.insert(root_block_device_2, false).unwrap_err(), DriveError::RootBlockDeviceAlreadyAdded ); } @@ -370,9 +414,9 @@ mod tests { }; let mut 
block_devs = BlockBuilder::new(); - block_devs.insert(dummy_block_dev_2.clone()).unwrap(); - block_devs.insert(dummy_block_dev_3.clone()).unwrap(); - block_devs.insert(root_block_device.clone()).unwrap(); + block_devs.insert(dummy_block_dev_2.clone(), false).unwrap(); + block_devs.insert(dummy_block_dev_3.clone(), false).unwrap(); + block_devs.insert(root_block_device.clone(), false).unwrap(); assert_eq!(block_devs.devices.len(), 3); @@ -443,9 +487,9 @@ mod tests { }; let mut block_devs = BlockBuilder::new(); - block_devs.insert(dummy_block_dev_2.clone()).unwrap(); - block_devs.insert(dummy_block_dev_3.clone()).unwrap(); - block_devs.insert(root_block_device.clone()).unwrap(); + block_devs.insert(dummy_block_dev_2.clone(), false).unwrap(); + block_devs.insert(dummy_block_dev_3.clone(), false).unwrap(); + block_devs.insert(root_block_device.clone(), false).unwrap(); assert_eq!(block_devs.devices.len(), 3); @@ -503,8 +547,10 @@ mod tests { let mut block_devs = BlockBuilder::new(); // Add 2 block devices. - block_devs.insert(root_block_device).unwrap(); - block_devs.insert(dummy_block_device_2.clone()).unwrap(); + block_devs.insert(root_block_device, false).unwrap(); + block_devs + .insert(dummy_block_device_2.clone(), false) + .unwrap(); // Get index zero. assert_eq!( @@ -528,7 +574,9 @@ mod tests { ); // Update OK. 
dummy_block_device_2.is_read_only = Some(true); - block_devs.insert(dummy_block_device_2.clone()).unwrap(); + block_devs + .insert(dummy_block_device_2.clone(), false) + .unwrap(); let index = block_devs .get_index_of_drive_id(&dummy_block_device_2.drive_id) @@ -540,7 +588,7 @@ mod tests { let dummy_path_3 = String::from("test_update_3"); dummy_block_device_2.path_on_host = Some(dummy_path_3); assert!(matches!( - block_devs.insert(dummy_block_device_2.clone()), + block_devs.insert(dummy_block_device_2.clone(), false), Err(DriveError::CreateBlockDevice(BlockError::VirtioBackend( VirtioBlockError::BackingFile(_, _) ))) @@ -550,7 +598,7 @@ mod tests { dummy_block_device_2.path_on_host = Some(dummy_path_2.clone()); dummy_block_device_2.is_root_device = true; assert_eq!( - block_devs.insert(dummy_block_device_2), + block_devs.insert(dummy_block_device_2, false), Err(DriveError::RootBlockDeviceAlreadyAdded) ); @@ -584,9 +632,9 @@ mod tests { socket: None, }; - block_devs.insert(root_block_device_old).unwrap(); + block_devs.insert(root_block_device_old, false).unwrap(); let root_block_id = root_block_device_new.drive_id.clone(); - block_devs.insert(root_block_device_new).unwrap(); + block_devs.insert(root_block_device_new, false).unwrap(); assert!(block_devs.has_root_device()); // Verify it's been moved to the first position. assert_eq!(block_devs.devices[0].lock().unwrap().id(), root_block_id); @@ -611,7 +659,9 @@ mod tests { }; let mut block_devs = BlockBuilder::new(); - block_devs.insert(dummy_block_device.clone()).unwrap(); + block_devs + .insert(dummy_block_device.clone(), false) + .unwrap(); let configs = block_devs.configs(); assert_eq!(configs.len(), 1); diff --git a/src/vmm/src/vmm_config/mod.rs b/src/vmm/src/vmm_config/mod.rs index 0e244ad4328..d266328f3a8 100644 --- a/src/vmm/src/vmm_config/mod.rs +++ b/src/vmm/src/vmm_config/mod.rs @@ -30,6 +30,8 @@ pub mod metrics; pub mod mmds; /// Wrapper for configuring the network devices attached to the microVM. 
pub mod net; +/// Wrapper for configuring the pmem devices attached to the microVM. +pub mod pmem; /// Wrapper for configuring microVM snapshots and the microVM state. pub mod serial; pub mod snapshot; diff --git a/src/vmm/src/vmm_config/pmem.rs b/src/vmm/src/vmm_config/pmem.rs new file mode 100644 index 00000000000..e8eec6b07ed --- /dev/null +++ b/src/vmm/src/vmm_config/pmem.rs @@ -0,0 +1,172 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use std::sync::{Arc, Mutex}; + +use serde::{Deserialize, Serialize}; + +use crate::devices::virtio::pmem::device::{Pmem, PmemError}; + +/// Errors associated with the operations allowed on a pmem device +#[derive(Debug, thiserror::Error, displaydoc::Display)] +pub enum PmemConfigError { + /// Attempt to add pmem as a root device while the root device defined as a block device + AddingSecondRootDevice, + /// A root pmem device already exist + RootPmemDeviceAlreadyExist, + /// Unable to create the virtio-pmem device: {0} + CreateDevice(#[from] PmemError), + /// Error accessing underlying file: {0} + File(std::io::Error), +} + +/// Use this structure to set up a Pmem device before booting the kernel. +#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct PmemConfig { + /// Unique identifier of the device. + pub id: String, + /// Path of the backing file on the host. + pub path_on_host: String, + /// Use this pmem device for rootfs + #[serde(default)] + pub root_device: bool, + /// Map the file as read only + #[serde(default)] + pub read_only: bool, +} + +/// Wrapper for the collection that holds all the Pmem devices. +#[derive(Debug, Default)] +pub struct PmemBuilder { + /// The list of pmem devices + pub devices: Vec>>, +} + +impl PmemBuilder { + /// Specifies whether there is a root pmem device already present in the list.
+ pub fn has_root_device(&self) -> bool { + self.devices + .iter() + .any(|d| d.lock().unwrap().config.root_device) + } + + /// Build a device from the config + pub fn build( + &mut self, + config: PmemConfig, + has_block_root: bool, + ) -> Result<(), PmemConfigError> { + if config.root_device && has_block_root { + return Err(PmemConfigError::AddingSecondRootDevice); + } + let position = self + .devices + .iter() + .position(|d| d.lock().unwrap().config.id == config.id); + if let Some(index) = position { + if !self.devices[index].lock().unwrap().config.root_device + && config.root_device + && self.has_root_device() + { + return Err(PmemConfigError::RootPmemDeviceAlreadyExist); + } + let pmem = Pmem::new(config)?; + let pmem = Arc::new(Mutex::new(pmem)); + self.devices[index] = pmem; + } else { + if config.root_device && self.has_root_device() { + return Err(PmemConfigError::RootPmemDeviceAlreadyExist); + } + let pmem = Pmem::new(config)?; + let pmem = Arc::new(Mutex::new(pmem)); + self.devices.push(pmem); + } + Ok(()) + } + + /// Adds an existing pmem device in the builder. + pub fn add_device(&mut self, device: Arc>) { + self.devices.push(device); + } + + /// Returns a vec with the structures used to configure the devices. 
+ pub fn configs(&self) -> Vec { + self.devices + .iter() + .map(|b| b.lock().unwrap().config.clone()) + .collect() + } +} + +#[cfg(test)] +mod tests { + use vmm_sys_util::tempfile::TempFile; + + use super::*; + + #[test] + fn test_pmem_builder_build() { + let mut builder = PmemBuilder::default(); + + let dummy_file = TempFile::new().unwrap(); + dummy_file.as_file().set_len(Pmem::ALIGNMENT).unwrap(); + let dummy_path = dummy_file.as_path().to_str().unwrap().to_string(); + let mut config = PmemConfig { + id: "1".into(), + path_on_host: dummy_path, + root_device: true, + read_only: false, + }; + builder.build(config.clone(), false).unwrap(); + assert_eq!(builder.devices.len(), 1); + assert!(builder.has_root_device()); + + // First device got replaced with new one + config.root_device = false; + builder.build(config, false).unwrap(); + assert_eq!(builder.devices.len(), 1); + assert!(!builder.has_root_device()); + } + + #[test] + fn test_pmem_builder_build_seconde_root() { + let mut builder = PmemBuilder::default(); + + let dummy_file = TempFile::new().unwrap(); + dummy_file.as_file().set_len(Pmem::ALIGNMENT).unwrap(); + let dummy_path = dummy_file.as_path().to_str().unwrap().to_string(); + let mut config = PmemConfig { + id: "1".into(), + path_on_host: dummy_path, + root_device: true, + read_only: false, + }; + builder.build(config.clone(), false).unwrap(); + + config.id = "2".into(); + assert!(matches!( + builder.build(config.clone(), false).unwrap_err(), + PmemConfigError::RootPmemDeviceAlreadyExist, + )); + } + + #[test] + fn test_pmem_builder_build_root_with_block_already_a_root() { + let mut builder = PmemBuilder::default(); + + let dummy_file = TempFile::new().unwrap(); + dummy_file.as_file().set_len(Pmem::ALIGNMENT).unwrap(); + let dummy_path = dummy_file.as_path().to_str().unwrap().to_string(); + let config = PmemConfig { + id: "1".into(), + path_on_host: dummy_path, + root_device: true, + read_only: false, + }; + assert!(matches!( + builder.build(config, 
true).unwrap_err(), + PmemConfigError::AddingSecondRootDevice, + )); + } +} diff --git a/src/vmm/src/vstate/resources.rs b/src/vmm/src/vstate/resources.rs index 2f906460333..f5cc9b970e1 100644 --- a/src/vmm/src/vstate/resources.rs +++ b/src/vmm/src/vstate/resources.rs @@ -50,6 +50,8 @@ pub struct ResourceAllocator { pub mmio32_memory: AddressAllocator, /// Allocator for memory in the 64-bit MMIO address space pub mmio64_memory: AddressAllocator, + /// Allocator for memory after the 64-bit MMIO address space + pub past_mmio64_memory: AddressAllocator, /// Memory allocator for system data pub system_memory: AddressAllocator, } @@ -79,6 +81,11 @@ impl ResourceAllocator { arch::MEM_64BIT_DEVICES_SIZE, ) .unwrap(), + past_mmio64_memory: AddressAllocator::new( + arch::FIRST_ADDR_PAST_64BITS_MMIO, + arch::PAST_64BITS_MMIO_SIZE, + ) + .unwrap(), system_memory: AddressAllocator::new(arch::SYSTEM_MEM_START, arch::SYSTEM_MEM_SIZE) .unwrap(), } diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index 5099694cf54..f22dcbd38be 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -9,6 +9,7 @@ use std::collections::HashMap; use std::fs::OpenOptions; use std::io::Write; use std::path::Path; +use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex, MutexGuard}; #[cfg(target_arch = "x86_64")] @@ -54,6 +55,7 @@ pub struct VmCommon { max_memslots: u32, /// The guest memory of this Vm. 
pub guest_memory: GuestMemoryMmap, + next_kvm_slot: AtomicU32, /// Interrupts used by Vm's devices pub interrupts: Mutex>, /// Allocator for VM resources @@ -133,6 +135,7 @@ impl Vm { fd, max_memslots: kvm.max_nr_memslots(), guest_memory: GuestMemoryMmap::default(), + next_kvm_slot: AtomicU32::new(0), interrupts: Mutex::new(HashMap::with_capacity(GSI_MSI_END as usize + 1)), resource_allocator: Mutex::new(ResourceAllocator::new()), mmio_bus: Arc::new(Bus::new()), @@ -159,6 +162,16 @@ impl Vm { Ok((vcpus, exit_evt)) } + /// Obtain the next free kvm slot id + pub fn next_kvm_slot(&self) -> Option { + let next = self.common.next_kvm_slot.fetch_add(1, Ordering::Relaxed); + if self.common.max_memslots <= next { + None + } else { + Some(next) + } + } + /// Register a list of new memory regions to this [`Vm`]. pub fn register_memory_regions( &mut self, @@ -174,13 +187,8 @@ impl Vm { /// Register a new memory region to this [`Vm`]. pub fn register_memory_region(&mut self, region: GuestRegionMmap) -> Result<(), VmError> { let next_slot = self - .guest_memory() - .num_regions() - .try_into() - .expect("Number of existing memory regions exceeds u32::MAX"); - if self.common.max_memslots <= next_slot { - return Err(VmError::NotEnoughMemorySlots(self.common.max_memslots)); - } + .next_kvm_slot() + .ok_or(VmError::NotEnoughMemorySlots(self.common.max_memslots))?; let flags = if region.bitmap().is_some() { KVM_MEM_LOG_DIRTY_PAGES diff --git a/tests/framework/http_api.py b/tests/framework/http_api.py index 16990a2a927..0ae2e279571 100644 --- a/tests/framework/http_api.py +++ b/tests/framework/http_api.py @@ -132,4 +132,5 @@ def __init__(self, api_usocket_full_name, *, on_error=None): self.snapshot_load = Resource(self, "/snapshot/load") self.cpu_config = Resource(self, "/cpu-config") self.entropy = Resource(self, "/entropy") + self.pmem = Resource(self, "/pmem", "id") self.serial = Resource(self, "/serial") diff --git a/tests/framework/microvm.py b/tests/framework/microvm.py index 
fa9dea79b82..74ae180950c 100644 --- a/tests/framework/microvm.py +++ b/tests/framework/microvm.py @@ -973,6 +973,24 @@ def add_net_iface(self, iface=None, api=True, **kwargs): return iface + def add_pmem( + self, + pmem_id, + path_on_host, + root_device=False, + read_only=False, + ): + """Add a pmem device.""" + + path_on_jail = self.create_jailed_resource(path_on_host) + self.api.pmem.put( + id=pmem_id, + path_on_host=path_on_jail, + root_device=root_device, + read_only=read_only, + ) + self.disks[pmem_id] = path_on_host + def start(self): """Start the microvm. diff --git a/tests/framework/vm_config.json b/tests/framework/vm_config.json index 6948002e245..ae3b4920444 100644 --- a/tests/framework/vm_config.json +++ b/tests/framework/vm_config.json @@ -31,5 +31,6 @@ "logger": null, "metrics": null, "mmds-config": null, - "entropy": null + "entropy": null, + "pmem": [] } diff --git a/tests/host_tools/fcmetrics.py b/tests/host_tools/fcmetrics.py index f340aa4aaf3..5b1343ffab7 100644 --- a/tests/host_tools/fcmetrics.py +++ b/tests/host_tools/fcmetrics.py @@ -223,6 +223,8 @@ def validate_fc_metrics(metrics): "mmds_fails", "vsock_count", "vsock_fails", + "pmem_count", + "pmem_fails", "serial_count", "serial_fails", ], @@ -293,6 +295,12 @@ def validate_fc_metrics(metrics): "rate_limiter_event_count", ], "interrupts": ["triggers", "config_updates"], + "pmem": [ + "activate_fails", + "cfg_fails", + "event_fails", + "queue_event_count", + ], } # validate timestamp before jsonschema validation which some more time diff --git a/tests/integration_tests/functional/test_api.py b/tests/integration_tests/functional/test_api.py index 32527e5c905..7dab0e14e6d 100644 --- a/tests/integration_tests/functional/test_api.py +++ b/tests/integration_tests/functional/test_api.py @@ -1044,6 +1044,67 @@ def test_api_balloon(uvm_nano): test_microvm.api.balloon.patch(amount_mib=33554432) +def test_pmem_api(uvm_plain_any, rootfs): + """ + Test virtio-pmem API commands + """ + + vm = uvm_plain_any 
+ vm.spawn() + vm.basic_config(add_root_device=False) + + invalid_pmem_path_on_host = os.path.join(vm.fsfiles, "invalid_scratch") + utils.check_output(f"touch {invalid_pmem_path_on_host}") + invalid_pmem_file_path = vm.create_jailed_resource(str(invalid_pmem_path_on_host)) + + pmem_size_mb = 2 + pmem_path_on_host = drive_tools.FilesystemFile( + os.path.join(vm.fsfiles, "scratch"), size=pmem_size_mb + ) + pmem_file_path = vm.create_jailed_resource(pmem_path_on_host.path) + + # Try to add pmem without setting `path_on_host` + expected_msg = re.escape( + "An error occurred when deserializing the json body of a request: missing field `path_on_host`" + ) + with pytest.raises(RuntimeError, match=expected_msg): + vm.api.pmem.put(id="pmem") + + # Try to add pmem with 0 sized backing file + expected_msg = re.escape("Error backing file size is 0") + with pytest.raises(RuntimeError, match=expected_msg): + vm.api.pmem.put(id="pmem", path_on_host=invalid_pmem_file_path) + + # Try to add pmem as root while block is set as root + vm.api.drive.put(drive_id="drive", path_on_host=pmem_file_path, is_root_device=True) + expected_msg = re.escape( + "Attempt to add pmem as a root device while the root device defined as a block device" + ) + with pytest.raises(RuntimeError, match=expected_msg): + vm.api.pmem.put(id="pmem", path_on_host=pmem_file_path, root_device=True) + + # Reset block from being root + vm.api.drive.put( + drive_id="drive", path_on_host=pmem_file_path, is_root_device=False + ) + + # Try to add pmem as root twice + vm.api.pmem.put(id="pmem", path_on_host=pmem_file_path, root_device=True) + expected_msg = re.escape("A root pmem device already exist") + with pytest.raises(RuntimeError, match=expected_msg): + vm.api.pmem.put(id="pmem2", path_on_host=pmem_file_path, root_device=True) + + # Reset pmem from being root + vm.api.pmem.put(id="pmem", path_on_host=pmem_file_path, root_device=False) + + # Add a rootfs to boot a vm + vm.add_pmem("rootfs", rootfs, True, True) + + # No 
post boot API calls to pmem + with pytest.raises(RuntimeError): + vm.api.pmem.put(id="pmem") + + def test_get_full_config_after_restoring_snapshot(microvm_factory, uvm_nano): """ Test the configuration of a microVM after restoring from a snapshot. @@ -1085,6 +1146,21 @@ def test_get_full_config_after_restoring_snapshot(microvm_factory, uvm_nano): } ] + uvm_nano.api.pmem.put( + id="pmem", + path_on_host="/" + uvm_nano.rootfs_file.name, + root_device=False, + read_only=False, + ) + setup_cfg["pmem"] = [ + { + "id": "pmem", + "path_on_host": "/" + uvm_nano.rootfs_file.name, + "root_device": False, + "read_only": False, + } + ] + # Add a memory balloon device. uvm_nano.api.balloon.put(amount_mib=1, deflate_on_oom=True) setup_cfg["balloon"] = { @@ -1196,6 +1272,21 @@ def test_get_full_config(uvm_plain): } ] + test_microvm.api.pmem.put( + id="pmem", + path_on_host="/" + test_microvm.rootfs_file.name, + root_device=False, + read_only=False, + ) + expected_cfg["pmem"] = [ + { + "id": "pmem", + "path_on_host": "/" + test_microvm.rootfs_file.name, + "root_device": False, + "read_only": False, + } + ] + # Add a memory balloon device. test_microvm.api.balloon.put(amount_mib=1, deflate_on_oom=True) expected_cfg["balloon"] = { diff --git a/tests/integration_tests/functional/test_pmem.py b/tests/integration_tests/functional/test_pmem.py new file mode 100644 index 00000000000..8d7c3fda843 --- /dev/null +++ b/tests/integration_tests/functional/test_pmem.py @@ -0,0 +1,128 @@ +# Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
# SPDX-License-Identifier: Apache-2.0
"""Tests for the virtio-pmem device."""

import json
import os

import host_tools.drive as drive_tools

# 2 MiB; `align` rounds sizes up to a multiple of this value.
ALIGNMENT = 2 << 20


def align(size: int) -> int:
    """
    Round `size` up to the nearest multiple of ALIGNMENT
    """
    return (size + ALIGNMENT - 1) & ~(ALIGNMENT - 1)


def check_pmem_exist(vm, index, root, read_only, size, extension):
    """
    Check that `/dev/pmem{index}` exists in the guest with correct parameters

    :param vm: microVM to inspect; commands are executed via `vm.ssh.run`
    :param index: N of the `/dev/pmemN` device to look for
    :param root: whether the device is expected to be mounted on `/`
    :param read_only: for a root device, whether `/` is expected to be
        mounted `ro` rather than `rw`
    :param size: expected size of the block device in bytes
    :param extension: expected file system type of the root mount
        (only checked when `root` is set)
    """
    device = f"pmem{index}"

    code, _, _ = vm.ssh.run(f"ls /dev/{device}")
    assert code == 0

    if root:
        code, stdout, _ = vm.ssh.run("mount")
        assert code == 0
        mode = "ro" if read_only else "rw"
        # Match on the device that was asked for instead of assuming pmem0
        assert f"/dev/{device} on / type {extension} ({mode}" in stdout

    code, stdout, _ = vm.ssh.run("lsblk -J")
    assert code == 0

    for block in json.loads(stdout)["blockdevices"]:
        if block["name"] != device:
            continue
        # The size is expected to be reported in whole megabytes, e.g. "2M"
        assert block["size"][-1] == "M"
        block_size_mb = int(block["size"][:-1])
        assert block_size_mb << 20 == size
        if root:
            assert "/" in block["mountpoints"]
        return

    # The device never showed up in the lsblk listing
    raise AssertionError(f"{device} is not listed in the `lsblk -J` output")


def test_pmem_add(uvm_plain_any, microvm_factory):
    """
    Test addition of two non root pmem devices and their presence
    after a snapshot restore
    """

    vm = uvm_plain_any
    vm.spawn()
    vm.basic_config(add_root_device=True)
    vm.add_net_iface()

    # Pmem should work with non 2MB aligned files as well
    pmem_size_mb_1 = 1
    fs_1 = drive_tools.FilesystemFile(
        os.path.join(vm.fsfiles, "scratch_1"), size=pmem_size_mb_1
    )
    pmem_size_mb_2 = 2
    fs_2 = drive_tools.FilesystemFile(
        os.path.join(vm.fsfiles, "scratch_2"), size=pmem_size_mb_2
    )
    vm.add_pmem("pmem_1", fs_1.path, False, False)
    vm.add_pmem("pmem_2", fs_2.path, False, True)
    vm.start()

    # Both 1MB and 2MB block will show as 2MB because of
    # the alignment
    expected_size_1 = align(pmem_size_mb_1 << 20)
    expected_size_2 = align(pmem_size_mb_2 << 20)

    check_pmem_exist(
        vm, 0, root=False, read_only=False, size=expected_size_1, extension="ext4"
    )
    check_pmem_exist(
        vm, 1, root=False, read_only=True, size=expected_size_2, extension="ext4"
    )

    # The devices must look the same in a VM restored from a snapshot
    snapshot = vm.snapshot_full()
    restored_vm = microvm_factory.build_from_snapshot(snapshot)
    check_pmem_exist(
        restored_vm,
        0,
        root=False,
        read_only=False,
        size=expected_size_1,
        extension="ext4",
    )
    check_pmem_exist(
        restored_vm,
        1,
        root=False,
        read_only=True,
        size=expected_size_2,
        extension="ext4",
    )


def test_pmem_add_as_root_rw(uvm_plain_any, rootfs_rw, microvm_factory):
    """
    Test addition of a single root pmem device in read-write mode
    """

    vm = uvm_plain_any
    # Run this test without the memory monitors
    vm.memory_monitor = None
    vm.monitors = []
    vm.spawn()
    vm.basic_config(add_root_device=False)
    vm.add_net_iface()

    expected_size = align(os.path.getsize(rootfs_rw))
    vm.add_pmem("pmem", rootfs_rw, True, False)
    vm.start()

    check_pmem_exist(
        vm, 0, root=True, read_only=False, size=expected_size, extension="ext4"
    )

    # The root device must look the same in a VM restored from a snapshot
    snapshot = vm.snapshot_full()
    restored_vm = microvm_factory.build_from_snapshot(snapshot)
    check_pmem_exist(
        restored_vm,
        0,
        root=True,
        read_only=False,
        size=expected_size,
        extension="ext4",
    )


def test_pmem_add_as_root_ro(uvm_plain_any, rootfs, microvm_factory):
    """
    Test addition of a single root pmem device in read-only mode
    """

    vm = uvm_plain_any
    # Run this test without the memory monitors
    vm.memory_monitor = None
    vm.monitors = []
    vm.spawn()
    vm.basic_config(add_root_device=False)
    vm.add_net_iface()

    expected_size = align(os.path.getsize(rootfs))
    vm.add_pmem("pmem", rootfs, True, True)
    vm.start()

    check_pmem_exist(
        vm, 0, root=True, read_only=True, size=expected_size, extension="squashfs"
    )

    # The root device must look the same in a VM restored from a snapshot
    snapshot = vm.snapshot_full()
    restored_vm = microvm_factory.build_from_snapshot(snapshot)
    check_pmem_exist(
        restored_vm,
        0,
        root=True,
        read_only=True,
        size=expected_size,
        extension="squashfs",
    )