diff --git a/.cargo/config b/.cargo/config index ef7728c6c98..0678c0723c6 100644 --- a/.cargo/config +++ b/.cargo/config @@ -1,6 +1,6 @@ [build] -target = "x86_64-unknown-linux-musl" -target-dir = "build/cargo_target" +# target = "x86_64-unknown-linux-musl" +# target-dir = "build/cargo_target" [net] git-fetch-with-cli = true diff --git a/Cargo.lock b/Cargo.lock index 338b2b1a74e..4ccc0cc02a4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,7 +17,7 @@ version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e8b47f52ea9bae42228d07ec09eb676433d7c4ed1ebdf0f1d1c29ed446f1ab8" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "cipher", "cpufeatures", "opaque-debug", @@ -111,9 +111,9 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.59.2" +version = "0.60.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bd2a9a458e8f4304c52c43ebb0cfbd520289f8379a52e329a38afda99bf8eb8" +checksum = "062dddbc1ba4aca46de6338e2bf87771414c335f7b2f2036e8f3e9befebf88e6" dependencies = [ "bitflags", "cexpr", @@ -182,12 +182,6 @@ dependencies = [ "nom", ] -[[package]] -name = "cfg-if" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" - [[package]] name = "cfg-if" version = "1.0.0" @@ -292,7 +286,7 @@ version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5aaa7bd5fb665c6864b5f963dd9097905c54125909c7aa94c9e18507cdbe6c53" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "crossbeam-utils", ] @@ -302,7 +296,7 @@ version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "crossbeam-epoch", "crossbeam-utils", ] @@ -313,7 +307,7 @@ version = "0.9.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"c00d6d2ea26e8b151d99093005cb442fb9a37aeaca582a03ec70946f49ab5ed9" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "crossbeam-utils", "lazy_static", "memoffset", @@ -326,7 +320,7 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e5bed1f1c269533fa816a0a5492b3545209a205ca1a54842be180eb63a16a6" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "lazy_static", ] @@ -464,7 +458,7 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "418d37c8b1d42553c93648be529cb70f920d3baf8ef469b74b9638df426e0b4c" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "libc", "wasi", ] @@ -595,7 +589,7 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "efbc0f03f9a775e9f6aed295c6a1ba2253c5757a9e03d55c6caa46a681abcddd" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "winapi", ] @@ -614,7 +608,7 @@ version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", ] [[package]] @@ -691,7 +685,7 @@ checksum = "9f866317acbd3a240710c63f065ffb1e4fd466259045ccb504130b7f668f35c6" dependencies = [ "bitflags", "cc", - "cfg-if 1.0.0", + "cfg-if", "libc", "memoffset", ] @@ -778,7 +772,7 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8419d2b623c7c0896ff2d5d96e2cb4ede590fed28fcc34934f4c33c036e620a1" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "cpufeatures", "opaque-debug", "universal-hash", @@ -1142,12 +1136,11 @@ dependencies = [ [[package]] name = "userfaultfd" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b738009e099b4ded1ecf19dfb7631f69c24f16e0af6d29fd9b3f54a092aca46" +version = "0.5.0" +source = 
"git+https://github.com/codesandbox/userfaultfd-rs.git?rev=3bafb90a85d8f2aa664dd391a5ae63ad1f43e3f5#3bafb90a85d8f2aa664dd391a5ae63ad1f43e3f5" dependencies = [ "bitflags", - "cfg-if 1.0.0", + "cfg-if", "libc", "nix", "thiserror", @@ -1156,13 +1149,12 @@ dependencies = [ [[package]] name = "userfaultfd-sys" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a4be003c705d2c8dc1234d473856945e291bb998ac2e2d83e70328d964d7458" +version = "0.4.2" +source = "git+https://github.com/codesandbox/userfaultfd-rs.git?rev=3bafb90a85d8f2aa664dd391a5ae63ad1f43e3f5#3bafb90a85d8f2aa664dd391a5ae63ad1f43e3f5" dependencies = [ "bindgen", "cc", - "cfg-if 0.1.10", + "cfg-if", ] [[package]] @@ -1321,7 +1313,7 @@ version = "0.2.79" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25f1af7423d8588a3d840681122e72e6a24ddbcb3f0ec385cac0d12d24256c06" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "wasm-bindgen-macro", ] diff --git a/docs/mmds/mmds-user-guide.md b/docs/mmds/mmds-user-guide.md index c6bac05100e..5f9b3f012d0 100644 --- a/docs/mmds/mmds-user-guide.md +++ b/docs/mmds/mmds-user-guide.md @@ -249,7 +249,7 @@ The session must start with an HTTP `PUT` request that generates the session tok In order to be successful, the request must respect the following constraints: - must be directed towards `/latest/api/token` path -- must contain a `X-ametadata-token-ttl-seconds` header specifying the token lifetime +- must contain a `X-metadata-token-ttl-seconds` header specifying the token lifetime in seconds. The value cannot be lower than 1 or greater than 21600 (6 hours). - must not contain a `X-Forwarded-For` header. 
diff --git a/resources/seccomp/aarch64-unknown-linux-musl.json b/resources/seccomp/aarch64-unknown-linux-musl.json index ca0e51381a5..5a2204f41d6 100644 --- a/resources/seccomp/aarch64-unknown-linux-musl.json +++ b/resources/seccomp/aarch64-unknown-linux-musl.json @@ -596,6 +596,19 @@ } ] }, + { + "syscall": "msync", + "comment": "Used to sync memory from mmap to disk", + "args": [ + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 4, + "comment": "MS_SYNC" + } + ] + }, { "syscall": "rt_sigaction", "comment": "rt_sigaction is used by libc::abort during a panic to install the default handler for SIGABRT", diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index b419581ea3f..51c90224393 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -248,6 +248,31 @@ } ] }, + { + "syscall": "msync", + "comment": "Used to sync memory from mmap to disk", + "args": [ + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 4, + "comment": "MS_SYNC" + } + ] + }, + { + "syscall": "memfd_create", + "comment": "Used to create a memory backed file descriptor that can be used to save memory to" + }, + { + "syscall": "nanosleep", + "comment": "Debugging sleep" + }, + { + "syscall": "copy_file_range", + "comment": "debugging" + }, { "syscall": "rt_sigaction", "comment": "rt_sigaction is used by libc::abort during a panic to install the default handler for SIGABRT", diff --git a/src/api_server/src/parsed_request.rs b/src/api_server/src/parsed_request.rs index 9b4cd6e64b9..b81e3391b0e 100644 --- a/src/api_server/src/parsed_request.rs +++ b/src/api_server/src/parsed_request.rs @@ -17,6 +17,7 @@ use crate::request::logger::parse_put_logger; use crate::request::machine_configuration::{ parse_get_machine_config, parse_patch_machine_config, parse_put_machine_config, }; +use crate::request::memory_backend::parse_put_memory_backend; use 
crate::request::metrics::parse_put_metrics; use crate::request::mmds::{parse_get_mmds, parse_patch_mmds, parse_put_mmds}; use crate::request::net::{parse_patch_net, parse_put_net}; @@ -112,6 +113,7 @@ impl ParsedRequest { (Method::Put, "network-interfaces", Some(body)) => { parse_put_net(body, path_tokens.get(1)) } + (Method::Put, "memory-backend", Some(body)) => parse_put_memory_backend(body), (Method::Put, "shutdown-internal", None) => { Ok(ParsedRequest::new(RequestAction::ShutdownInternal)) } diff --git a/src/api_server/src/request/memory_backend.rs b/src/api_server/src/request/memory_backend.rs new file mode 100644 index 00000000000..b81c7f5fc78 --- /dev/null +++ b/src/api_server/src/request/memory_backend.rs @@ -0,0 +1,46 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use super::super::VmmAction; +use crate::parsed_request::{Error, ParsedRequest}; +use crate::request::Body; +use logger::{IncMetric, METRICS}; +use vmm::vmm_config::snapshot::MemBackendConfig; + +pub(crate) fn parse_put_memory_backend(body: &Body) -> Result { + METRICS.put_api_requests.memory_backend_cfg_count.inc(); + Ok(ParsedRequest::new_sync(VmmAction::SetMemoryBackend( + serde_json::from_slice::(body.raw()).map_err(|e| { + METRICS.put_api_requests.memory_backend_cfg_fails.inc(); + Error::SerdeJson(e) + })?, + ))) +} + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + + use vmm::vmm_config::snapshot::MemBackendType; + + use super::*; + + #[test] + fn test_parse_memory_backing_file() { + assert!(parse_put_memory_backend(&Body::new("invalid_payload")).is_err()); + + let body = r#"{ + "backend_type": "File", + "backend_path": "./memory.snap" + }"#; + let same_body = MemBackendConfig { + backend_type: MemBackendType::File, + backend_path: PathBuf::from("./memory.snap"), + }; + let result = parse_put_memory_backend(&Body::new(body)); + assert!(result.is_ok()); + let parsed_req = result.unwrap_or_else(|_e| 
panic!("Failed test.")); + + assert!(parsed_req == ParsedRequest::new_sync(VmmAction::SetMemoryBackend(same_body))); + } +} diff --git a/src/api_server/src/request/mod.rs b/src/api_server/src/request/mod.rs index 75f9a0daef3..f58bce5b533 100644 --- a/src/api_server/src/request/mod.rs +++ b/src/api_server/src/request/mod.rs @@ -8,6 +8,7 @@ pub mod drive; pub mod instance_info; pub mod logger; pub mod machine_configuration; +pub mod memory_backend; pub mod metrics; pub mod mmds; pub mod net; diff --git a/src/api_server/swagger/firecracker.yaml b/src/api_server/swagger/firecracker.yaml index 07d242532e6..88f0d2697a5 100644 --- a/src/api_server/swagger/firecracker.yaml +++ b/src/api_server/swagger/firecracker.yaml @@ -350,6 +350,29 @@ paths: description: Internal server error schema: $ref: "#/definitions/Error" + + /memory-backend: + put: + summary: Configures a memory backend to which guest memory changes are synced during the runtime of the VM + operationId: putMemoryBackend + parameters: + - name: body + in: body + description: The memory backend to use + required: true + schema: + $ref: "#/definitions/MemoryBackend" + responses: + 204: + description: Memory backend configured + 400: + description: Memory backend failed + schema: + $ref: "#/definitions/Error" + default: + description: Internal server error. + schema: + $ref: "#/definitions/Error" /metrics: put: diff --git a/src/cpuid/src/transformer/amd.rs b/src/cpuid/src/transformer/amd.rs index c1db38260a5..8fc6222f2d9 100644 --- a/src/cpuid/src/transformer/amd.rs +++ b/src/cpuid/src/transformer/amd.rs @@ -147,6 +147,8 @@ impl CpuidTransformer for AmdCpuidTransformer { leaf_0x8000001d::LEAF_NUM => Some(amd::update_extended_cache_topology_entry), leaf_0x8000001e::LEAF_NUM => Some(amd::update_extended_apic_id_entry), 0x8000_0002..=0x8000_0004 => Some(common::update_brand_string_entry), + // Disable async PF, as it hangs the VM for some reason when loading from snapshot/uffd. 
+ 0x4000_0001 => Some(common::disable_kvm_feature_async_pf), _ => None, } } diff --git a/src/cpuid/src/transformer/common.rs b/src/cpuid/src/transformer/common.rs index ba89fc35f74..ea2592a07da 100644 --- a/src/cpuid/src/transformer/common.rs +++ b/src/cpuid/src/transformer/common.rs @@ -69,6 +69,19 @@ pub fn update_brand_string_entry( Ok(()) } +// KVM feature bits +#[cfg(target_arch = "x86_64")] +const KVM_FEATURE_ASYNC_PF_INT_BIT: u32 = 14; + +pub fn disable_kvm_feature_async_pf( + entry: &mut kvm_cpuid_entry2, + _vm_spec: &VmSpec, +) -> Result<(), Error> { + entry.eax.write_bit(KVM_FEATURE_ASYNC_PF_INT_BIT, false); + + Ok(()) +} + pub fn update_cache_parameters_entry( entry: &mut kvm_cpuid_entry2, vm_spec: &VmSpec, diff --git a/src/cpuid/src/transformer/intel.rs b/src/cpuid/src/transformer/intel.rs index 8505b668932..db897b5ab31 100644 --- a/src/cpuid/src/transformer/intel.rs +++ b/src/cpuid/src/transformer/intel.rs @@ -126,6 +126,8 @@ impl CpuidTransformer for IntelCpuidTransformer { leaf_0xa::LEAF_NUM => Some(intel::update_perf_mon_entry), leaf_0xb::LEAF_NUM => Some(intel::update_extended_topology_entry), 0x8000_0002..=0x8000_0004 => Some(common::update_brand_string_entry), + // Disable async PF, as it hangs the VM for some reason when loading from snapshot/uffd. 
+ 0x4000_0001 => Some(common::disable_kvm_feature_async_pf), _ => None, } } diff --git a/src/devices/src/virtio/balloon/device.rs b/src/devices/src/virtio/balloon/device.rs index 31af2a49af0..ae185dd9f66 100644 --- a/src/devices/src/virtio/balloon/device.rs +++ b/src/devices/src/virtio/balloon/device.rs @@ -18,12 +18,13 @@ use vm_memory::{Address, ByteValued, Bytes, GuestAddress, GuestMemoryMmap}; use super::super::{ActivateResult, DeviceState, Queue, VirtioDevice, TYPE_BALLOON}; use super::utils::{compact_page_frame_numbers, remove_range}; use super::{ - BALLOON_DEV_ID, DEFLATE_INDEX, INFLATE_INDEX, MAX_PAGES_IN_DESC, MAX_PAGE_COMPACT_BUFFER, - MIB_TO_4K_PAGES, NUM_QUEUES, QUEUE_SIZES, STATS_INDEX, VIRTIO_BALLOON_F_DEFLATE_ON_OOM, - VIRTIO_BALLOON_F_STATS_VQ, VIRTIO_BALLOON_PFN_SHIFT, VIRTIO_BALLOON_S_AVAIL, - VIRTIO_BALLOON_S_CACHES, VIRTIO_BALLOON_S_HTLB_PGALLOC, VIRTIO_BALLOON_S_HTLB_PGFAIL, - VIRTIO_BALLOON_S_MAJFLT, VIRTIO_BALLOON_S_MEMFREE, VIRTIO_BALLOON_S_MEMTOT, - VIRTIO_BALLOON_S_MINFLT, VIRTIO_BALLOON_S_SWAP_IN, VIRTIO_BALLOON_S_SWAP_OUT, + BALLOON_DEV_ID, DEFLATE_INDEX, FREE_PAGE_REPORTING_INDEX, INFLATE_INDEX, MAX_PAGES_IN_DESC, + MAX_PAGE_COMPACT_BUFFER, MIB_TO_4K_PAGES, NUM_QUEUES, QUEUE_SIZES, STATS_INDEX, + VIRTIO_BALLOON_F_DEFLATE_ON_OOM, VIRTIO_BALLOON_F_REPORTING, VIRTIO_BALLOON_F_STATS_VQ, + VIRTIO_BALLOON_PFN_SHIFT, VIRTIO_BALLOON_S_AVAIL, VIRTIO_BALLOON_S_CACHES, + VIRTIO_BALLOON_S_HTLB_PGALLOC, VIRTIO_BALLOON_S_HTLB_PGFAIL, VIRTIO_BALLOON_S_MAJFLT, + VIRTIO_BALLOON_S_MEMFREE, VIRTIO_BALLOON_S_MEMTOT, VIRTIO_BALLOON_S_MINFLT, + VIRTIO_BALLOON_S_SWAP_IN, VIRTIO_BALLOON_S_SWAP_OUT, }; use crate::virtio::balloon::Error as BalloonError; use crate::virtio::{IrqTrigger, IrqType}; @@ -167,10 +168,15 @@ impl Balloon { avail_features |= 1u64 << VIRTIO_BALLOON_F_STATS_VQ; } + avail_features |= 1u64 << VIRTIO_BALLOON_F_REPORTING; + + logger::debug!("balloon: registering balloon device"); + let queue_evts = [ 
EventFd::new(libc::EFD_NONBLOCK).map_err(BalloonError::EventFd)?, EventFd::new(libc::EFD_NONBLOCK).map_err(BalloonError::EventFd)?, EventFd::new(libc::EFD_NONBLOCK).map_err(BalloonError::EventFd)?, + EventFd::new(libc::EFD_NONBLOCK).map_err(BalloonError::EventFd)?, ]; let mut queues: Vec = QUEUE_SIZES.iter().map(|&s| Queue::new(s)).collect(); @@ -231,6 +237,14 @@ impl Balloon { self.trigger_stats_update() } + pub(crate) fn process_free_page_report_event(&mut self) -> Result<(), BalloonError> { + logger::debug!("balloon: received free page report event"); + self.queue_evts[FREE_PAGE_REPORTING_INDEX] + .read() + .map_err(BalloonError::EventFd)?; + self.process_free_page_reporting_queue() + } + pub(crate) fn process_inflate(&mut self) -> Result<(), BalloonError> { // This is safe since we checked in the event handler that the device is activated. let mem = self.device_state.mem().unwrap(); @@ -382,6 +396,50 @@ impl Balloon { Ok(()) } + pub(crate) fn process_free_page_reporting_queue( + &mut self, + ) -> std::result::Result<(), BalloonError> { + logger::debug!("balloon: processing free page reporting queue"); + let mem = self.device_state.mem().unwrap(); + + let mut total_removed = 0; + let queue = &mut self.queues[FREE_PAGE_REPORTING_INDEX]; + let mut needs_interrupt = false; + + while let Some(head) = queue.pop(mem) { + let head_index = head.index; + let head_mem = head.mem; + + let mut last_desc = Some(head); + while let Some(desc) = last_desc { + total_removed += desc.len; + if let Err(err) = + remove_range(desc.mem, (desc.addr, desc.len as u64), self.restored) + { + error!("balloon: failed to remove range: {:?}", err); + }; + last_desc = desc.next_descriptor(); + } + + // Acknowledge the receipt of the descriptor. 
+ queue + .add_used(head_mem, head_index, 0) + .map_err(BalloonError::Queue)?; + + logger::debug!("balloon: adding to the queue"); + + needs_interrupt = true; + } + + logger::debug!("balloon: total removed: {}MiB", total_removed >> 20); + + if needs_interrupt { + self.signal_used_queue()?; + } + + Ok(()) + } + pub(crate) fn signal_used_queue(&self) -> Result<(), BalloonError> { self.irq_trigger.trigger_irq(IrqType::Vring).map_err(|err| { METRICS.balloon.event_fails.inc(); @@ -393,6 +451,7 @@ impl Balloon { pub fn process_virtio_queues(&mut self) { let _ = self.process_inflate(); let _ = self.process_deflate_queue(); + let _ = self.process_free_page_reporting_queue(); } pub fn id(&self) -> &str { diff --git a/src/devices/src/virtio/balloon/event_handler.rs b/src/devices/src/virtio/balloon/event_handler.rs index 841f5d231db..f2ad8678232 100644 --- a/src/devices/src/virtio/balloon/event_handler.rs +++ b/src/devices/src/virtio/balloon/event_handler.rs @@ -9,7 +9,9 @@ use utils::epoll::EventSet; use crate::report_balloon_event_fail; use crate::virtio::balloon::device::Balloon; -use crate::virtio::{VirtioDevice, DEFLATE_INDEX, INFLATE_INDEX, STATS_INDEX}; +use crate::virtio::{ + VirtioDevice, DEFLATE_INDEX, FREE_PAGE_REPORTING_INDEX, INFLATE_INDEX, STATS_INDEX, +}; impl Balloon { fn register_runtime_events(&self, ops: &mut EventOps) { @@ -27,6 +29,15 @@ impl Balloon { error!("Failed to register stats timerfd event: {}", err); } } + if let Err(err) = ops.add(Events::new( + &self.queue_evts[FREE_PAGE_REPORTING_INDEX], + EventSet::IN, + )) { + error!( + "Failed to register free page reporting queue event: {}", + err + ); + } } fn register_activate_event(&self, ops: &mut EventOps) { @@ -65,6 +76,7 @@ impl MutEventSubscriber for Balloon { let virtq_inflate_ev_fd = self.queue_evts[INFLATE_INDEX].as_raw_fd(); let virtq_deflate_ev_fd = self.queue_evts[DEFLATE_INDEX].as_raw_fd(); let virtq_stats_ev_fd = self.queue_evts[STATS_INDEX].as_raw_fd(); + let free_page_report_ev_fd = 
self.queue_evts[FREE_PAGE_REPORTING_INDEX].as_raw_fd(); let stats_timer_fd = self.stats_timer.as_raw_fd(); let activate_fd = self.activate_evt.as_raw_fd(); @@ -82,6 +94,9 @@ impl MutEventSubscriber for Balloon { _ if source == stats_timer_fd => self .process_stats_timer_event() .unwrap_or_else(report_balloon_event_fail), + _ if source == free_page_report_ev_fd => self + .process_free_page_report_event() + .unwrap_or_else(report_balloon_event_fail), _ if activate_fd == source => self.process_activate_event(ops), _ => { warn!("Balloon: Spurious event received: {:?}", source); diff --git a/src/devices/src/virtio/balloon/mod.rs b/src/devices/src/virtio/balloon/mod.rs index fb614c9ca6e..c6f8b619cd3 100644 --- a/src/devices/src/virtio/balloon/mod.rs +++ b/src/devices/src/virtio/balloon/mod.rs @@ -17,8 +17,9 @@ pub use self::event_handler::*; pub const BALLOON_DEV_ID: &str = "balloon"; pub const CONFIG_SPACE_SIZE: usize = 8; pub const QUEUE_SIZE: u16 = 256; -pub const NUM_QUEUES: usize = 3; -pub const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE, QUEUE_SIZE, QUEUE_SIZE]; +pub const REPORTING_QUEUE_SIZE: u16 = 32; +pub const NUM_QUEUES: usize = 4; +pub const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE, QUEUE_SIZE, QUEUE_SIZE, QUEUE_SIZE]; // Number of 4K pages in a MiB. pub const MIB_TO_4K_PAGES: u32 = 256; // The maximum number of pages that can be received in a single descriptor. @@ -34,10 +35,13 @@ pub const INFLATE_INDEX: usize = 0; pub const DEFLATE_INDEX: usize = 1; // The index of the deflate queue from Balloon device queues/queues_evts vector. pub const STATS_INDEX: usize = 2; +// The index of the free page reporting queue from Balloon device queues/queues_evts vector. +pub const FREE_PAGE_REPORTING_INDEX: usize = 3; // The feature bitmap for virtio balloon. const VIRTIO_BALLOON_F_STATS_VQ: u32 = 1; // Enable statistics. const VIRTIO_BALLOON_F_DEFLATE_ON_OOM: u32 = 2; // Deflate balloon on OOM. +const VIRTIO_BALLOON_F_REPORTING: u32 = 5; // Page reporting virtqueue // The statistics tags. 
const VIRTIO_BALLOON_S_SWAP_IN: u16 = 0; diff --git a/src/devices/src/virtio/balloon/utils.rs b/src/devices/src/virtio/balloon/utils.rs index 53ec5d3efd0..2a3ed7af56c 100644 --- a/src/devices/src/virtio/balloon/utils.rs +++ b/src/devices/src/virtio/balloon/utils.rs @@ -68,7 +68,7 @@ pub(crate) fn compact_page_frame_numbers(v: &mut [u32]) -> Vec<(u32, u32)> { pub(crate) fn remove_range( guest_memory: &GuestMemoryMmap, range: (GuestAddress, u64), - restored: bool, + _restored: bool, ) -> std::result::Result<(), RemoveRegionError> { let (guest_address, range_len) = range; @@ -80,24 +80,26 @@ pub(crate) fn remove_range( .get_host_address(guest_address) .map_err(|_| RemoveRegionError::AddressTranslation)?; - // Mmap a new anonymous region over the present one in order to create a hole. - // This workaround is (only) needed after resuming from a snapshot because the guest memory - // is mmaped from file as private and there is no `madvise` flag that works for this case. - if restored { - let ret = unsafe { - libc::mmap( - phys_address as *mut _, - range_len as usize, - libc::PROT_READ | libc::PROT_WRITE, - libc::MAP_FIXED | libc::MAP_ANONYMOUS | libc::MAP_PRIVATE, - -1, - 0, - ) - }; - if ret == libc::MAP_FAILED { - return Err(RemoveRegionError::MmapFail(io::Error::last_os_error())); - } - }; + // CodeSandbox: since we use UFFD handler, this is not needed for us. In fact, it breaks the UFFD handler + // if this happens right now, as it unregisters the UFFD handler for the given range. + // // Mmap a new anonymous region over the present one in order to create a hole. + // // This workaround is (only) needed after resuming from a snapshot because the guest memory + // // is mmaped from file as private and there is no `madvise` flag that works for this case. 
+ // if restored { + // let ret = unsafe { + // libc::mmap( + // phys_address as *mut _, + // range_len as usize, + // libc::PROT_READ | libc::PROT_WRITE, + // libc::MAP_FIXED | libc::MAP_ANONYMOUS | libc::MAP_PRIVATE, + // -1, + // 0, + // ) + // }; + // if ret == libc::MAP_FAILED { + // return Err(RemoveRegionError::MmapFail(io::Error::last_os_error())); + // } + // }; // Madvise the region in order to mark it as not used. let ret = unsafe { diff --git a/src/firecracker/src/api_server_adapter.rs b/src/firecracker/src/api_server_adapter.rs index 7934530e1c4..cd5725eedfb 100644 --- a/src/firecracker/src/api_server_adapter.rs +++ b/src/firecracker/src/api_server_adapter.rs @@ -71,6 +71,7 @@ impl MutEventSubscriber for ApiServerAdapter { let event_set = event.event_set(); if source == self.api_event_fd.as_raw_fd() && event_set == EventSet::IN { + let _ = self.api_event_fd.read(); match self.from_api.try_recv() { Ok(api_request) => { let request_is_pause = *api_request == VmmAction::Pause; @@ -101,7 +102,6 @@ impl MutEventSubscriber for ApiServerAdapter { panic!("The channel's sending half was disconnected. Cannot receive data."); } }; - let _ = self.api_event_fd.read(); } else { error!("Spurious EventManager event for handler: ApiServerAdapter"); } @@ -129,7 +129,7 @@ pub(crate) fn run_with_api( // FD to notify of API events. This is a blocking eventfd by design. // It is used in the config/pre-boot loop which is a simple blocking loop // which only consumes API events. - let api_event_fd = EventFd::new(0).expect("Cannot create API Eventfd."); + let api_event_fd = EventFd::new(libc::EFD_SEMAPHORE).expect("Cannot create API Eventfd."); // Channels for both directions between Vmm and Api threads. 
let (to_vmm, from_api) = channel(); diff --git a/src/jailer/src/env.rs b/src/jailer/src/env.rs index 405b74e4337..bd8c39219b2 100644 --- a/src/jailer/src/env.rs +++ b/src/jailer/src/env.rs @@ -371,9 +371,8 @@ impl Env { // a new PathBuf, with something like chroot_dir.join(exec_file_name) ?! self.chroot_dir.push(exec_file_name); - // TODO: hard link instead of copy? This would save up disk space, but hard linking is - // not always possible :( - fs::copy(&self.exec_file_path, &self.chroot_dir).map_err(|err| { + // We hard link instead of copy for space savings and to retain the capabilities + fs::hard_link(&self.exec_file_path, &self.chroot_dir).map_err(|err| { Error::Copy(self.exec_file_path.clone(), self.chroot_dir.clone(), err) })?; diff --git a/src/logger/src/metrics.rs b/src/logger/src/metrics.rs index 779689f04b8..f009004362f 100644 --- a/src/logger/src/metrics.rs +++ b/src/logger/src/metrics.rs @@ -403,6 +403,10 @@ pub struct PutRequestsMetrics { pub machine_cfg_count: SharedIncMetric, /// Number of failures in configuring the machine. pub machine_cfg_fails: SharedIncMetric, + /// Number of PUTs for configuring the memory backend. + pub memory_backend_cfg_count: SharedIncMetric, + /// Number of failures in configuring the memory backend. + pub memory_backend_cfg_fails: SharedIncMetric, /// Number of PUTs for initializing the metrics system. pub metrics_count: SharedIncMetric, /// Number of failures in initializing the metrics system. diff --git a/src/mmds/src/lib.rs b/src/mmds/src/lib.rs index 5360c5a0f0a..ae206ab7d6e 100644 --- a/src/mmds/src/lib.rs +++ b/src/mmds/src/lib.rs @@ -42,7 +42,7 @@ impl fmt::Display for Error { ), Error::NoTtlProvided => write!( f, - "Token time to live value not found. Use `X-metadata-token-ttl_seconds` header to \ + "Token time to live value not found. Use `X-metadata-token-ttl-seconds` header to \ specify the token's lifetime." 
), Error::ResourceNotFound(ref uri) => { @@ -705,8 +705,8 @@ mod tests { assert_eq!( Error::NoTtlProvided.to_string(), - "Token time to live value not found. Use `X-metadata-token-ttl_seconds` header to \ - specify the token's lifetime." + "Token time to live value not found. Use `X-metadata-token-ttl-seconds` header to \ + specify the token's lifetime." ); assert_eq!( diff --git a/src/mmds/src/token_headers.rs b/src/mmds/src/token_headers.rs index 5d13d9a4245..1f03d0b4f4e 100644 --- a/src/mmds/src/token_headers.rs +++ b/src/mmds/src/token_headers.rs @@ -39,12 +39,19 @@ impl TokenHeaders { /// Return `TokenHeaders` from headers map. pub fn try_from(map: &HashMap) -> Result { let mut headers = Self::default(); + let lowercased_headers: HashMap = map + .iter() + .map(|(k, v)| (k.to_lowercase(), v.clone())) + .collect(); - if let Some(token) = map.get(TokenHeaders::X_METADATA_TOKEN) { + if let Some(token) = lowercased_headers.get(&TokenHeaders::X_METADATA_TOKEN.to_lowercase()) + { headers.x_metadata_token = Some(token.to_string()); } - if let Some(value) = map.get(TokenHeaders::X_METADATA_TOKEN_TTL_SECONDS) { + if let Some(value) = + lowercased_headers.get(&TokenHeaders::X_METADATA_TOKEN_TTL_SECONDS.to_lowercase()) + { match value.parse::() { Ok(seconds) => { headers.x_metadata_token_ttl_seconds = Some(seconds); @@ -127,6 +134,17 @@ mod tests { let headers = TokenHeaders::try_from(&map).unwrap(); assert_eq!(*headers.x_metadata_token().unwrap(), "".to_string()); + // Lowercased headers + let mut map: HashMap = HashMap::default(); + map.insert( + TokenHeaders::X_METADATA_TOKEN_TTL_SECONDS + .to_string() + .to_lowercase(), + "60".to_string(), + ); + let headers = TokenHeaders::try_from(&map).unwrap(); + assert_eq!(headers.x_metadata_token_ttl_seconds().unwrap(), 60); + // Invalid value. 
let mut map: HashMap = HashMap::default(); map.insert( diff --git a/src/snapshot/src/lib.rs b/src/snapshot/src/lib.rs index 1e4c476391d..5a43bcc8560 100644 --- a/src/snapshot/src/lib.rs +++ b/src/snapshot/src/lib.rs @@ -219,9 +219,10 @@ impl Snapshot { object .serialize(&mut writer, &self.version_map, self.target_version) .map_err(Error::Versionize)?; - writer - .flush() - .map_err(|ref err| Error::Io(err.raw_os_error().unwrap_or(libc::EINVAL))) + // writer + // .flush() + // .map_err(|ref err| Error::Io(err.raw_os_error().unwrap_or(libc::EINVAL))) + Ok(()) } // Returns the current snapshot format version. diff --git a/src/vm-memory/src/lib.rs b/src/vm-memory/src/lib.rs index 814f79098b4..1d18139dd99 100644 --- a/src/vm-memory/src/lib.rs +++ b/src/vm-memory/src/lib.rs @@ -117,7 +117,7 @@ pub fn create_guest_memory( for region in regions { let flags = match region.0 { None => libc::MAP_NORESERVE | libc::MAP_PRIVATE | libc::MAP_ANONYMOUS, - Some(_) => libc::MAP_NORESERVE | libc::MAP_PRIVATE, + Some(_) => libc::MAP_NORESERVE | libc::MAP_SHARED, }; let mmap_region = diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index 68a64a4d23c..50a5bd78232 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -14,12 +14,16 @@ libc = ">=0.2.39" linux-loader = ">=0.4.0" serde = { version = ">=1.0.27", features = ["derive"] } serde_json = ">=1.0.9" -userfaultfd = ">=0.4.0" +userfaultfd = { git = "https://github.com/codesandbox/userfaultfd-rs.git", rev = "3bafb90a85d8f2aa664dd391a5ae63ad1f43e3f5", features = [ + "linux5_7", +] } versionize = ">=0.1.6" versionize_derive = ">=0.1.3" vm-superio = ">=0.4.0" vm-allocator = "0.1.0" -derive_more = { version = "0.99.17", default-features = false, features = ["from"] } +derive_more = { version = "0.99.17", default-features = false, features = [ + "from", +] } arch = { path = "../arch" } devices = { path = "../devices" } @@ -27,7 +31,7 @@ logger = { path = "../logger" } mmds = { path = "../mmds" } rate_limiter = { path = 
"../rate_limiter" } seccompiler = { path = "../seccompiler" } -snapshot = { path = "../snapshot"} +snapshot = { path = "../snapshot" } utils = { path = "../utils" } virtio_gen = { path = "../virtio_gen" } vm-memory = { path = "../vm-memory" } diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index bffb43e47cf..95e7065e310 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -5,9 +5,15 @@ use std::convert::TryFrom; use std::fmt::{Display, Formatter}; +use std::fs::{File, OpenOptions}; use std::io::{self, Read, Seek, SeekFrom}; use std::os::unix::io::{AsRawFd, RawFd}; +use std::os::unix::net::UnixStream; +use std::path::Path; use std::sync::{Arc, Mutex}; +use userfaultfd::{FeatureFlags, Uffd, UffdBuilder}; +use utils::sock_ctrl_msg::ScmSocket; +use vm_memory::{FileOffset, GuestMemory}; use arch::InitrdConfig; #[cfg(target_arch = "x86_64")] @@ -28,7 +34,6 @@ use linux_loader::loader::KernelLoader; use logger::{error, warn, METRICS}; use seccompiler::BpfThreadMap; use snapshot::Persist; -use userfaultfd::Uffd; use utils::eventfd::EventFd; use utils::terminal::Terminal; use utils::time::TimestampUs; @@ -43,7 +48,7 @@ use crate::construct_kvm_mpidrs; use crate::device_manager::legacy::PortIODeviceManager; use crate::device_manager::mmio::MMIODeviceManager; use crate::device_manager::persist::MMIODevManagerConstructorArgs; -use crate::persist::{MicrovmState, MicrovmStateError}; +use crate::persist::{GuestRegionUffdMapping, MemoryDescriptor, MicrovmState, MicrovmStateError}; use crate::resources::VmResources; use crate::vmm_config::boot_source::BootConfig; use crate::vmm_config::instance_info::InstanceInfo; @@ -58,6 +63,8 @@ use crate::{device_manager, mem_size_mib, Error, EventManager, Vmm, VmmEventsObs pub enum StartMicrovmError { /// Unable to attach block device to Vmm. AttachBlockDevice(io::Error), + /// Unable to create/open the memory backing file. 
+ BackingMemoryFile(io::Error), /// This error is thrown by the minimal boot loader implementation. ConfigureSystem(arch::Error), /// Internal errors are due to resource exhaustion. @@ -94,6 +101,20 @@ pub enum StartMicrovmError { RestoreMicrovmState(MicrovmStateError), /// Unable to set VmResources. SetVmResources(VmConfigError), + /// Failed to create an UFFD Builder. + CreateUffdBuilder(userfaultfd::Error), + /// Unable to connect to UDS in order to send information regarding + /// handling guest memory page-fault events. + UdsConnection(io::Error), + /// Failed to register guest memory regions to UFFD. + UffdMemoryRegionsRegister(userfaultfd::Error), + /// Failed to send guest memory layout and path to user fault FD used to handle + /// guest memory page faults. This information is sent to a UDS where a custom + /// page-fault handler process is listening. + UffdSend(kvm_ioctls::Error), + + /// Failed to get the memfd from the uffd socket + NoMemFdReceived, } /// It's convenient to automatically convert `linux_loader::cmdline::Error`s @@ -112,6 +133,9 @@ impl Display for StartMicrovmError { write!(f, "Unable to attach block device to Vmm: {}", err) } ConfigureSystem(err) => write!(f, "System configuration error: {:?}", err), + BackingMemoryFile(err) => { + write!(f, "Unable to create the memory backing file: {}", err) + } CreateRateLimiter(err) => write!(f, "Cannot create RateLimiter: {}", err), CreateNetDevice(err) => { let mut err_msg = format!("{:?}", err); @@ -177,6 +201,13 @@ impl Display for StartMicrovmError { } RestoreMicrovmState(err) => write!(f, "Cannot restore microvm state. Error: {}", err), SetVmResources(err) => write!(f, "Cannot set vm resources. Error: {}", err), + CreateUffdBuilder(err) => write!(f, "Cannot create uffd socket. Error: {}", err), + UdsConnection(err) => write!(f, "Cannot connect to uffd socket. Error: {}", err), + UffdMemoryRegionsRegister(err) => { + write!(f, "Cannot uffd memory region register. 
Error: {}", err) + } + UffdSend(err) => write!(f, "Cannot send to uffd. Error: {}", err), + NoMemFdReceived => write!(f, "No memfd received from uffd."), } } } @@ -231,7 +262,7 @@ fn create_vmm_and_vcpus( instance_info: &InstanceInfo, event_manager: &mut EventManager, guest_memory: GuestMemoryMmap, - uffd: Option, + memory_descriptor: Option, track_dirty_pages: bool, vcpu_count: u8, ) -> std::result::Result<(Vmm, Vec), StartMicrovmError> { @@ -297,7 +328,7 @@ fn create_vmm_and_vcpus( shutdown_exit_code: None, vm, guest_memory, - uffd, + memory_descriptor, vcpus_handles: Vec::new(), vcpus_exit_evt, mmio_device_manager, @@ -329,8 +360,57 @@ pub fn build_microvm_for_boot( let boot_config = vm_resources.boot_source().ok_or(MissingKernelConfig)?; let track_dirty_pages = vm_resources.track_dirty_pages(); - let guest_memory = - create_guest_memory(vm_resources.vm_config().mem_size_mib, track_dirty_pages)?; + + let (guest_memory, memory_descriptor, _file) = + if let Some(ref backend_config) = vm_resources.memory_backend { + match backend_config.backend_type { + crate::vmm_config::snapshot::MemBackendType::File => { + let file = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .open(&backend_config.backend_path) + .map_err(BackingMemoryFile)?; + file.set_len((vm_resources.vm_config().mem_size_mib * 1024 * 1024) as u64) + .map_err(|e| { + error!("Failed to set backing memory file size: {}", e); + StartMicrovmError::BackingMemoryFile(e) + })?; + + let file = Arc::new(file); + + ( + create_guest_memory( + vm_resources.vm_config().mem_size_mib, + Some(file.clone()), + track_dirty_pages, + )?, + Some(MemoryDescriptor::File(file)), + None, + ) + } + crate::vmm_config::snapshot::MemBackendType::Uffd => { + let (mem, uffd, file) = create_uffd_guest_memory( + vm_resources.vm_config().mem_size_mib, + backend_config.backend_path.as_path(), + track_dirty_pages, + )?; + + (mem, Some(MemoryDescriptor::Uffd(uffd)), Some(file)) + } + } + } else { + ( + create_guest_memory( 
+ vm_resources.vm_config().mem_size_mib, + None, + track_dirty_pages, + )?, + None, + None, + ) + }; + let vcpu_config = vm_resources.vcpu_config(); let entry_addr = load_kernel(boot_config, &guest_memory)?; let initrd = load_initrd_from_config(boot_config, &guest_memory)?; @@ -362,7 +442,7 @@ pub fn build_microvm_for_boot( instance_info, event_manager, guest_memory, - None, + memory_descriptor, track_dirty_pages, vcpu_config.vcpu_count, )?; @@ -451,7 +531,7 @@ pub fn build_microvm_from_snapshot( event_manager: &mut EventManager, microvm_state: MicrovmState, guest_memory: GuestMemoryMmap, - uffd: Option, + memory_descriptor: Option, track_dirty_pages: bool, seccomp_filters: &BpfThreadMap, vm_resources: &mut VmResources, @@ -466,7 +546,7 @@ pub fn build_microvm_from_snapshot( instance_info, event_manager, guest_memory.clone(), - uffd, + memory_descriptor, track_dirty_pages, vcpu_count, )?; @@ -581,21 +661,144 @@ pub fn build_microvm_from_snapshot( /// Creates GuestMemory of `mem_size_mib` MiB in size. pub fn create_guest_memory( mem_size_mib: usize, + backing_memory_file: Option>, track_dirty_pages: bool, ) -> std::result::Result { let mem_size = mem_size_mib << 20; let arch_mem_regions = arch::arch_memory_regions(mem_size); + let mut offset = 0_u64; vm_memory::create_guest_memory( &arch_mem_regions .iter() - .map(|(addr, size)| (None, *addr, *size)) + .map(|(addr, size)| { + let file_offset = backing_memory_file + .clone() + .map(|file| FileOffset::from_arc(file, offset)); + offset += *size as u64; + + (file_offset, *addr, *size) + }) .collect::>()[..], track_dirty_pages, ) .map_err(StartMicrovmError::GuestMemoryMmap) } +/// Creates GuestMemory of `mem_size_mib` MiB in size. 
+pub fn create_uffd_guest_memory( + mem_size_mib: usize, + uds_socket_path: &Path, + track_dirty_pages: bool, +) -> std::result::Result<(GuestMemoryMmap, Uffd, Arc), StartMicrovmError> { + use StartMicrovmError::{CreateUffdBuilder, NoMemFdReceived, UdsConnection, UffdSend}; + + let mut socket = UnixStream::connect(uds_socket_path).map_err(UdsConnection)?; + + let mut buf = [0u8; 8]; + let (_, memfd) = socket.recv_with_fd(&mut buf).map_err(UffdSend)?; + + if memfd.is_none() { + return Err(NoMemFdReceived); + } + + let mem_size = mem_size_mib << 20; + let arch_mem_regions = arch::arch_memory_regions(mem_size); + let backing_memory_file = Arc::new(memfd.unwrap()); + + let mut offset = 0_u64; + let guest_memory = vm_memory::create_guest_memory( + &arch_mem_regions + .iter() + .map(|(addr, size)| { + let file_offset = Some(FileOffset::from_arc(backing_memory_file.clone(), offset)); + offset += *size as u64; + + (file_offset, *addr, *size) + }) + .collect::>()[..], + track_dirty_pages, + ) + .map_err(StartMicrovmError::GuestMemoryMmap)?; + + let uffd = UffdBuilder::new() + .require_features( + FeatureFlags::EVENT_REMOVE + | FeatureFlags::EVENT_REMAP + | FeatureFlags::EVENT_FORK + | FeatureFlags::EVENT_UNMAP + | FeatureFlags::MISSING_SHMEM + | FeatureFlags::MINOR_SHMEM + | FeatureFlags::PAGEFAULT_FLAG_WP, + ) + .user_mode_only(false) + .non_blocking(true) + .create() + .map_err(CreateUffdBuilder)?; + + let mut backend_mappings = Vec::with_capacity(guest_memory.num_regions()); + let mut offset = 0; + for mem_region in guest_memory.iter() { + let host_base_addr = mem_region.as_ptr(); + let size = mem_region.size(); + + backend_mappings.push(GuestRegionUffdMapping { + base_host_virt_addr: host_base_addr as u64, + size, + offset, + }); + offset += size as u64; + } + + // This is safe to unwrap() because we control the contents of the vector + // (i.e GuestRegionUffdMapping entries). 
+ let backend_mappings = serde_json::to_string(&backend_mappings).unwrap(); + + socket + .send_with_fd( + backend_mappings.as_bytes(), + // In the happy case we can close the fd since the other process has it open and is + // using it to serve us pages. + // + // The problem is that if other process crashes/exits, firecracker guest memory + // will simply revert to anon-mem behavior which would lead to silent errors and + // undefined behavior. + // + // To tackle this scenario, the page fault handler can notify Firecracker of any + // crashes/exits. There is no need for Firecracker to explicitly send its process ID. + // The external process can obtain Firecracker's PID by calling `getsockopt` with + // `libc::SO_PEERCRED` option like so: + // + // let mut val = libc::ucred { pid: 0, gid: 0, uid: 0 }; + // let mut ucred_size: u32 = mem::size_of::() as u32; + // libc::getsockopt( + // socket.as_raw_fd(), + // libc::SOL_SOCKET, + // libc::SO_PEERCRED, + // &mut val as *mut _ as *mut _, + // &mut ucred_size as *mut libc::socklen_t, + // ); + // + // Per this linux man page: https://man7.org/linux/man-pages/man7/unix.7.html, + // `SO_PEERCRED` returns the credentials (PID, UID and GID) of the peer process + // connected to this socket. The returned credentials are those that were in effect + // at the time of the `connect` call. + // + // Moreover, Firecracker holds a copy of the UFFD fd as well, so that even if the + // page fault handler process does not tear down Firecracker when necessary, the + // uffd will still be alive but with no one to serve faults, leading to guest freeze. + uffd.as_raw_fd(), + ) + .map_err(UffdSend)?; + + // Wait for UFFD to be ready. + // TODO: maybe add a timeout? 
+ let mut buf = [0; 2]; + socket.read_exact(&mut buf).map_err(UdsConnection)?; + + Ok((guest_memory, uffd, backing_memory_file)) +} + fn load_kernel( boot_config: &BootConfig, guest_memory: &GuestMemoryMmap, @@ -694,7 +897,7 @@ pub(crate) fn setup_kvm_vm( .map_err(Error::KvmContext) .map_err(Internal)?; let mut vm = Vm::new(kvm.fd()).map_err(Error::Vm).map_err(Internal)?; - vm.memory_init(&guest_memory, kvm.max_memslots(), track_dirty_pages) + vm.memory_init(guest_memory, kvm.max_memslots(), track_dirty_pages) .map_err(Error::Vm) .map_err(Internal)?; Ok(vm) @@ -1068,7 +1271,7 @@ pub mod tests { } pub(crate) fn default_vmm() -> Vmm { - let guest_memory = create_guest_memory(128, false).unwrap(); + let guest_memory = create_guest_memory(128, None, false).unwrap(); let vcpus_exit_evt = EventFd::new(libc::EFD_NONBLOCK) .map_err(Error::EventFd) @@ -1096,12 +1299,12 @@ pub mod tests { shutdown_exit_code: None, vm, guest_memory, - uffd: None, vcpus_handles: Vec::new(), vcpus_exit_evt, mmio_device_manager, #[cfg(target_arch = "x86_64")] pio_device_manager, + memory_descriptor: None, } } @@ -1283,13 +1486,13 @@ pub mod tests { // Case 1: create guest memory without dirty page tracking { - let guest_memory = create_guest_memory(mem_size, false).unwrap(); + let guest_memory = create_guest_memory(mem_size, None, false).unwrap(); assert!(!is_dirty_tracking_enabled(&guest_memory)); } // Case 2: create guest memory with dirty page tracking { - let guest_memory = create_guest_memory(mem_size, true).unwrap(); + let guest_memory = create_guest_memory(mem_size, None, true).unwrap(); assert!(is_dirty_tracking_enabled(&guest_memory)); } } @@ -1297,7 +1500,7 @@ pub mod tests { #[test] fn test_create_vcpus() { let vcpu_count = 2; - let guest_memory = create_guest_memory(128, false).unwrap(); + let guest_memory = create_guest_memory(128, None, false).unwrap(); #[allow(unused_mut)] let mut vm = setup_kvm_vm(&guest_memory, false).unwrap(); diff --git a/src/vmm/src/device_manager/mmio.rs 
b/src/vmm/src/device_manager/mmio.rs index 26d1bd8e232..9149dd4bd5c 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -19,8 +19,8 @@ use devices::legacy::RTCDevice; use devices::legacy::SerialDevice; use devices::pseudo::BootTimer; use devices::virtio::{ - Balloon, Block, MmioTransport, Net, VirtioDevice, TYPE_BALLOON, TYPE_BLOCK, TYPE_NET, - TYPE_VSOCK, + Balloon, Block, MmioTransport, Net, VirtioDevice, Vsock, VsockUnixBackend, TYPE_BALLOON, + TYPE_BLOCK, TYPE_NET, TYPE_VSOCK, }; use devices::BusDevice; use kvm_ioctls::{IoEventAddress, VmFd}; @@ -437,6 +437,16 @@ impl MMIODeviceManager { // so for Vsock we don't support connection persistence through snapshot. // Any in-flight packets or events are simply lost. // Vsock is restored 'empty'. + // The only reason we still `kick` it is to make guest process + // `TRANSPORT_RESET_EVENT` event we sent during snapshot creation. + let vsock = virtio + .as_mut_any() + .downcast_mut::>() + .unwrap(); + if vsock.is_activated() { + info!("kick vsock."); + vsock.signal_used_queue().unwrap(); + } } _ => (), } diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 56a093a690a..5250a08d281 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -293,11 +293,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { .downcast_mut::>() .unwrap(); - let vsock_state = VsockState { - backend: vsock.backend().save(), - frontend: vsock.save(), - }; - // Send Transport event to reset connections if device // is activated. if vsock.is_activated() { @@ -306,6 +301,13 @@ impl<'a> Persist<'a> for MMIODeviceManager { }); } + // Save state after potential notification to the guest. This + // way we save changes to the queue the notification can cause. 
+ let vsock_state = VsockState { + backend: vsock.backend().save(), + frontend: vsock.save(), + }; + states.vsock_device = Some(ConnectedVsockState { device_id: devid.clone(), device_state: vsock_state, diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index d71932525b5..90975ac6acc 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -50,10 +50,10 @@ use devices::virtio::{ use devices::BusDevice; use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEventSubscriber}; use logger::{error, info, warn, LoggerError, MetricsError, METRICS}; +use persist::MemoryDescriptor; use rate_limiter::BucketUpdate; use seccompiler::BpfProgram; use snapshot::Persist; -use userfaultfd::Uffd; use utils::epoll::EventSet; use utils::eventfd::EventFd; use vm_memory::{GuestMemory, GuestMemoryMmap, GuestMemoryRegion}; @@ -260,10 +260,6 @@ pub struct Vmm { // Guest VM core resources. vm: Vm, guest_memory: GuestMemoryMmap, - // Save UFFD in order to keep it open in the Firecracker process, as well. - // Since this field is never read again, we need to allow `dead_code`. - #[allow(dead_code)] - uffd: Option, vcpus_handles: Vec, // Used by Vcpus and devices to initiate teardown; Vmm should never write here. vcpus_exit_evt: EventFd, @@ -272,6 +268,11 @@ pub struct Vmm { mmio_device_manager: MMIODeviceManager, #[cfg(target_arch = "x86_64")] pio_device_manager: PortIODeviceManager, + + // The mem file that should be mmaped. 
We need to keep a reference of the UFFD in the + // process so we allow dead_code + #[allow(dead_code)] + memory_descriptor: Option, } impl Vmm { diff --git a/src/vmm/src/memory_snapshot.rs b/src/vmm/src/memory_snapshot.rs index 9eebe371da0..8e5293bcfcd 100644 --- a/src/vmm/src/memory_snapshot.rs +++ b/src/vmm/src/memory_snapshot.rs @@ -6,7 +6,9 @@ use std::fmt::{Display, Formatter}; use std::fs::File; use std::io::SeekFrom; +use std::time::Instant; +use libc::{MAP_SHARED, PROT_WRITE}; use utils::{errno, get_page_size}; use versionize::{VersionMap, Versionize, VersionizeResult}; use versionize_derive::Versionize; @@ -126,7 +128,11 @@ impl SnapshotMemory for GuestMemoryMmap { let mut writer_offset = 0; let page_size = get_page_size()?; - self.iter() + let start = Instant::now(); + let mut total_written = 0; + + let res = self + .iter() .enumerate() .try_for_each(|(slot, region)| { let kvm_bitmap = dirty_bitmap.get(&slot).unwrap(); @@ -150,23 +156,37 @@ impl SnapshotMemory for GuestMemoryMmap { } write_size += page_size; } else if write_size > 0 { + let start = Instant::now(); // We are at the end of a batch of dirty pages. 
region.write_all_to( MemoryRegionAddress(dirty_batch_start), writer, write_size, )?; + eprintln!( + "writing {}B took {}ms", + write_size, + start.elapsed().as_millis() + ); + total_written += write_size; write_size = 0; } } } if write_size > 0 { + let start = Instant::now(); region.write_all_to( MemoryRegionAddress(dirty_batch_start), writer, write_size, )?; + total_written += write_size; + eprintln!( + "writing {}B took {}ms", + write_size, + start.elapsed().as_millis() + ); } writer_offset += region.len(); if let Some(bitmap) = firecracker_bitmap { @@ -175,7 +195,15 @@ impl SnapshotMemory for GuestMemoryMmap { Ok(()) }) - .map_err(Error::WriteMemory) + .map_err(Error::WriteMemory); + + eprintln!( + "total write time: {}ms, total written: {}B", + start.elapsed().as_millis(), + total_written + ); + + res } /// Creates a GuestMemoryMmap backed by a `file` if present, otherwise backed @@ -199,6 +227,117 @@ impl SnapshotMemory for GuestMemoryMmap { } } +/// Dumps all pages of GuestMemoryMmap present in `dirty_bitmap` to a writer. 
+pub fn mem_dump_dirty( + mem_map: &GuestMemoryMmap, + fd: i32, + len: usize, + dirty_bitmap: &DirtyBitmap, +) -> std::result::Result<(), Error> { + let mut writer_offset = 0_u64; + let page_size = get_page_size()?; + + let start = Instant::now(); + let mut total_written = 0; + + let source_map = + unsafe { libc::mmap(std::ptr::null_mut(), len, PROT_WRITE, MAP_SHARED, fd, 0) }; + + let res = mem_map + .iter() + .enumerate() + .try_for_each(|(slot, region)| { + let kvm_bitmap = dirty_bitmap.get(&slot).unwrap(); + let firecracker_bitmap = region.bitmap(); + let mut write_size = 0; + let mut dirty_batch_start: u64 = 0; + + let mmap_base = region.get_host_address(MemoryRegionAddress(0)).unwrap(); + for (i, v) in kvm_bitmap.iter().enumerate() { + for j in 0..64 { + let is_kvm_page_dirty = ((v >> j) & 1u64) != 0u64; + let page_offset = ((i * 64) + j) * page_size; + let is_firecracker_page_dirty = firecracker_bitmap.dirty_at(page_offset); + if is_kvm_page_dirty || is_firecracker_page_dirty { + // We are at the start of a new batch of dirty pages. + if write_size == 0 { + // Seek forward over the unmodified pages. 
+ dirty_batch_start = page_offset as u64; + } + write_size += page_size; + } else if write_size > 0 { + let start = Instant::now(); + + eprintln!( + "starting write of {}B (source {}, dest {})", + write_size, + dirty_batch_start, + writer_offset + dirty_batch_start + ); + unsafe { + std::ptr::copy_nonoverlapping( + mmap_base.offset((dirty_batch_start) as isize), + source_map.offset((writer_offset + dirty_batch_start) as isize) + as *mut u8, + write_size, + ); + } + + eprintln!( + "writing {}B took {}ms", + write_size, + start.elapsed().as_millis() + ); + total_written += write_size; + write_size = 0; + } + } + } + + if write_size > 0 { + let start = Instant::now(); + + eprintln!( + "starting final write of {}B (source {}, dest {}) (total_size: {})", + write_size, + dirty_batch_start, + writer_offset + dirty_batch_start, + len + ); + unsafe { + std::ptr::copy_nonoverlapping( + mmap_base.offset((dirty_batch_start) as isize), + source_map.offset((writer_offset + dirty_batch_start) as isize) as *mut u8, + write_size, + ); + } + total_written += write_size; + eprintln!( + "writing {}B took {}ms", + write_size, + start.elapsed().as_millis() + ); + } + writer_offset += region.len(); + if let Some(bitmap) = firecracker_bitmap { + bitmap.reset(); + } + + Ok(()) + }) + .map_err(Error::WriteMemory); + + eprintln!( + "total write time: {}ms, total written: {}B", + start.elapsed().as_millis(), + total_written + ); + + eprintln!("memfd {}, len {}", fd, len); + + res +} + #[cfg(test)] mod tests { use std::collections::HashMap; diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 309cbce46b3..c417973ac10 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -3,11 +3,13 @@ //! Defines state structures for saving/restoring a Firecracker microVM. 
+use std::ffi::CString; use std::fmt::{Display, Formatter}; use std::fs::{File, OpenOptions}; -use std::io::{self, Write}; +use std::io::{self, Read, Write}; use std::os::unix::io::AsRawFd; use std::os::unix::net::UnixStream; +use std::os::unix::prelude::FromRawFd; use std::path::Path; use std::sync::{Arc, Mutex}; @@ -16,6 +18,7 @@ use arch::regs::{get_manufacturer_id_from_host, get_manufacturer_id_from_state}; #[cfg(target_arch = "x86_64")] use cpuid::common::{get_vendor_id_from_cpuid, get_vendor_id_from_host}; use devices::virtio::TYPE_NET; +use libc::memfd_create; use logger::{error, info}; use seccompiler::BpfThreadMap; use serde::Serialize; @@ -29,7 +32,7 @@ use vm_memory::{GuestMemory, GuestMemoryMmap}; use crate::builder::{self, StartMicrovmError}; use crate::device_manager::persist::{DeviceStates, Error as DevicePersistError}; -use crate::memory_snapshot::{GuestMemoryState, SnapshotMemory}; +use crate::memory_snapshot::{mem_dump_dirty, GuestMemoryState, SnapshotMemory}; use crate::resources::VmResources; #[cfg(target_arch = "x86_64")] use crate::version_map::FC_V0_23_SNAP_VERSION; @@ -224,6 +227,8 @@ pub enum LoadSnapshotError { /// Unable to connect to UDS in order to send information regarding /// handling guest memory page-fault events. UdsConnection(io::Error), + /// We didn't get the memfd when handshaking with the uffd manager + NoMemFdReceived, /// Failed to register guest memory regions to UFFD. 
UffdMemoryRegionsRegister(userfaultfd::Error), /// Failed to send guest memory layout and path to user fault FD used to handle @@ -245,6 +250,7 @@ impl Display for LoadSnapshotError { } InvalidSnapshot(err) => write!(f, "Snapshot sanity check failed: {}", err), MemoryBackingFile(err) => write!(f, "Cannot open the memory file: {}", err), + NoMemFdReceived => write!(f, "No memory file descriptor received"), ResumeMicroVm(err) => write!( f, "Failed to resume microVM after loading snapshot: {}", @@ -276,7 +282,7 @@ pub fn create_snapshot( version_map: VersionMap, ) -> std::result::Result<(), CreateSnapshotError> { // Fail early from invalid target version. - let snapshot_data_version = get_snapshot_data_version(&params.version, &version_map, &vmm)?; + let snapshot_data_version = get_snapshot_data_version(&params.version, &version_map, vmm)?; let microvm_state = vmm .save_state() @@ -289,7 +295,9 @@ pub fn create_snapshot( version_map, )?; - snapshot_memory_to_file(vmm, &params.mem_file_path, &params.snapshot_type)?; + if params.snapshot_type == SnapshotType::Full { + snapshot_memory_to_file(vmm, &params.mem_file_path, &params.snapshot_type)?; + } Ok(()) } @@ -311,12 +319,14 @@ fn snapshot_state_to_file( snapshot .save(&mut snapshot_file, microvm_state) .map_err(SerializeMicrovmState)?; - snapshot_file - .flush() - .map_err(|err| SnapshotBackingFile("flush", err))?; - snapshot_file - .sync_all() - .map_err(|err| SnapshotBackingFile("sync_all", err)) + // Disable the following lines as we're seeing some performance issues with btrfs on these operations + // snapshot_file + // .flush() + // .map_err(|err| SnapshotBackingFile("flush", err))?; + // snapshot_file + // .sync_all() + // .map_err(|err| SnapshotBackingFile("sync_all", err)) + Ok(()) } fn snapshot_memory_to_file( @@ -325,31 +335,51 @@ fn snapshot_memory_to_file( snapshot_type: &SnapshotType, ) -> std::result::Result<(), CreateSnapshotError> { use self::CreateSnapshotError::*; - let mut file = 
OpenOptions::new() - .write(true) - 
.create(true) - .truncate(true) - .open(mem_file_path) - .map_err(|err| MemoryBackingFile("open", err))?; + + let mut file = if mem_file_path.to_string_lossy() == "memfd" { + let fd = unsafe { + let memfd_name = CString::new("diff").unwrap(); + memfd_create(memfd_name.as_ptr(), 0) + }; + if fd == -1 { + return Err(MemoryBackingFile( + "memfd_create", + std::io::Error::last_os_error(), + )); + } + + unsafe { File::from_raw_fd(fd) } + } else { + OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(mem_file_path) + .map_err(|err| MemoryBackingFile("open", err))? + }; // Set the length of the file to the full size of the memory area. let mem_size_mib = mem_size_mib(vmm.guest_memory()); + // Set the length of the file to the full size of the memory area. file.set_len((mem_size_mib * 1024 * 1024) as u64) .map_err(|err| MemoryBackingFile("set_length", err))?; match snapshot_type { SnapshotType::Diff => { let dirty_bitmap = vmm.get_dirty_bitmap().map_err(DirtyBitmap)?; - vmm.guest_memory() - .dump_dirty(&mut file, &dirty_bitmap) - .map_err(Memory) + + mem_dump_dirty( + vmm.guest_memory(), + file.as_raw_fd(), + (mem_size_mib * 1024 * 1024) as usize, + &dirty_bitmap, + ) + .map_err(Memory) } SnapshotType::Full => vmm.guest_memory().dump(&mut file).map_err(Memory), }?; - file.flush() - .map_err(|err| MemoryBackingFile("flush", err))?; - file.sync_all() - .map_err(|err| MemoryBackingFile("sync_all", err)) + + Ok(()) } /// Validate the microVM version and translate it to its corresponding snapshot data format. @@ -483,6 +513,16 @@ pub fn snapshot_state_sanity_check( Ok(()) } +/// Describes a descriptor that connects to the memory used by the VM. This can be either a file descriptor +/// or a UFFD descriptor. +#[derive(Debug)] +pub enum MemoryDescriptor { + /// A file descriptor that connects to the user fault process. + Uffd(Uffd), + /// A file descriptor of the backing memory file. 
+ File(Arc), +} + /// Loads a Microvm snapshot producing a 'paused' Microvm. pub fn restore_from_snapshot( instance_info: &InstanceInfo, @@ -501,26 +541,25 @@ pub fn restore_from_snapshot( let mem_backend_path = &params.mem_backend.backend_path; let mem_state = &microvm_state.memory_state; let track_dirty_pages = params.enable_diff_snapshots; - let (guest_memory, uffd) = match params.mem_backend.backend_type { - MemBackendType::File => ( - guest_memory_from_file(mem_backend_path, mem_state, track_dirty_pages)?, - None, - ), - MemBackendType::Uffd => guest_memory_from_uffd( - mem_backend_path, - mem_state, - track_dirty_pages, - // We enable the UFFD_FEATURE_EVENT_REMOVE feature only if a balloon device - // is present in the microVM state. - microvm_state.device_states.balloon_device.is_some(), - )?, + let (guest_memory, memory_descriptor) = match params.mem_backend.backend_type { + MemBackendType::File => { + let (guest_memory, file) = + guest_memory_from_file(mem_backend_path, mem_state, track_dirty_pages)?; + (guest_memory, Some(MemoryDescriptor::File(Arc::new(file)))) + } + MemBackendType::Uffd => { + let (guest_memory, uffd) = + guest_memory_from_uffd(mem_backend_path, mem_state, track_dirty_pages)?; + + (guest_memory, uffd.map(MemoryDescriptor::Uffd)) + } }; builder::build_microvm_from_snapshot( instance_info, event_manager, microvm_state, guest_memory, - uffd, + memory_descriptor, track_dirty_pages, seccomp_filters, vm_resources, @@ -545,36 +584,53 @@ fn guest_memory_from_file( mem_file_path: &Path, mem_state: &GuestMemoryState, track_dirty_pages: bool, -) -> std::result::Result { +) -> std::result::Result<(GuestMemoryMmap, File), LoadSnapshotError> { use self::LoadSnapshotError::{DeserializeMemory, MemoryBackingFile}; - let mem_file = File::open(mem_file_path).map_err(MemoryBackingFile)?; - GuestMemoryMmap::restore(Some(&mem_file), mem_state, track_dirty_pages) - .map_err(DeserializeMemory) + let mem_file = OpenOptions::new() + .write(true) + 
.read(true) + 
.open(mem_file_path) + .map_err(MemoryBackingFile)?; + + Ok(( + GuestMemoryMmap::restore(Some(&mem_file), mem_state, track_dirty_pages) + .map_err(DeserializeMemory)?, + mem_file, + )) } -fn guest_memory_from_uffd( +pub(crate) fn guest_memory_from_uffd( mem_uds_path: &Path, mem_state: &GuestMemoryState, track_dirty_pages: bool, - enable_balloon: bool, ) -> std::result::Result<(GuestMemoryMmap, Option), LoadSnapshotError> { - use self::LoadSnapshotError::{ - CreateUffdBuilder, DeserializeMemory, UdsConnection, UffdMemoryRegionsRegister, UffdSend, - }; + use self::LoadSnapshotError::{CreateUffdBuilder, DeserializeMemory, UdsConnection, UffdSend}; - let guest_memory = - GuestMemoryMmap::restore(None, mem_state, track_dirty_pages).map_err(DeserializeMemory)?; + let mut socket = UnixStream::connect(mem_uds_path).map_err(UdsConnection)?; - let mut uffd_builder = UffdBuilder::new(); + let mut buf = [0u8; 8]; + let (_, memfd) = socket.recv_with_fd(&mut buf).map_err(UffdSend)?; - if enable_balloon { - // We enable this so that the page fault handler can add logic - // for treating madvise(MADV_DONTNEED) events triggerd by balloon inflation. 
- uffd_builder.require_features(FeatureFlags::EVENT_REMOVE); + if memfd.is_none() { + return Err(LoadSnapshotError::NoMemFdReceived); } - let uffd = uffd_builder - .close_on_exec(true) + let memfd = memfd.unwrap(); + + let guest_memory = GuestMemoryMmap::restore(Some(&memfd), mem_state, track_dirty_pages) + .map_err(DeserializeMemory)?; + + let uffd = UffdBuilder::new() + .require_features( + FeatureFlags::EVENT_REMOVE + | FeatureFlags::EVENT_REMAP + | FeatureFlags::EVENT_FORK + | FeatureFlags::EVENT_UNMAP + | FeatureFlags::MISSING_SHMEM + | FeatureFlags::MINOR_SHMEM + | FeatureFlags::PAGEFAULT_FLAG_WP, + ) + .user_mode_only(false) .non_blocking(true) .create() .map_err(CreateUffdBuilder)?; @@ -584,8 +640,6 @@ fn guest_memory_from_uffd( let host_base_addr = mem_region.as_ptr(); let size = mem_region.size(); - uffd.register(host_base_addr as _, size as _) - .map_err(UffdMemoryRegionsRegister)?; backend_mappings.push(GuestRegionUffdMapping { base_host_virt_addr: host_base_addr as u64, size, @@ -597,7 +651,6 @@ fn guest_memory_from_uffd( // (i.e GuestRegionUffdMapping entries). let backend_mappings = serde_json::to_string(&backend_mappings).unwrap(); - let socket = UnixStream::connect(mem_uds_path).map_err(UdsConnection)?; socket .send_with_fd( backend_mappings.as_bytes(), @@ -635,6 +688,11 @@ fn guest_memory_from_uffd( ) .map_err(UffdSend)?; + // Wait for UFFD to be ready. + // TODO: maybe add a timeout? 
+ let mut buf = [0; 2]; + socket.read_exact(&mut buf).map_err(UdsConnection)?; + Ok((guest_memory, Some(uffd))) } diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index 9a5257ac307..94c4c7d05fa 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -20,6 +20,7 @@ use crate::vmm_config::machine_config::{VmConfig, VmConfigError, VmUpdateConfig} use crate::vmm_config::metrics::{init_metrics, MetricsConfig, MetricsConfigError}; use crate::vmm_config::mmds::{MmdsConfig, MmdsConfigError}; use crate::vmm_config::net::*; +use crate::vmm_config::snapshot::MemBackendConfig; use crate::vmm_config::vsock::*; use crate::vstate::vcpu::VcpuConfig; @@ -117,6 +118,8 @@ pub struct VmResources { pub mmds_size_limit: usize, /// Whether or not to load boot timer device. pub boot_timer: bool, + /// When backed by a memory on boot, this should be set + pub memory_backend: Option<MemBackendConfig>, } impl VmResources { @@ -236,6 +239,16 @@ impl VmResources { self.vm_config.track_dirty_pages = dirty_page_tracking; } + /// Returns the config for the backing memory file + pub fn memory_backend(&self) -> Option<MemBackendConfig> { + self.memory_backend.clone() + } + + /// Sets the backing memory file + pub fn set_memory_backend(&mut self, backing_mem_file: MemBackendConfig) { + self.memory_backend.get_or_insert(backing_mem_file); + } + /// Returns the VmConfig.
pub fn vm_config(&self) -> &VmConfig { &self.vm_config @@ -575,6 +588,7 @@ mod tests { mmds: None, boot_timer: false, mmds_size_limit: HTTP_MAX_PAYLOAD_SIZE, + memory_backend: None, } } diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index 2a79237d3f1..4c263c048b7 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -39,7 +39,9 @@ use crate::vmm_config::mmds::{MmdsConfig, MmdsConfigError}; use crate::vmm_config::net::{ NetworkInterfaceConfig, NetworkInterfaceError, NetworkInterfaceUpdateConfig, }; -use crate::vmm_config::snapshot::{CreateSnapshotParams, LoadSnapshotParams, SnapshotType}; +use crate::vmm_config::snapshot::{ + CreateSnapshotParams, LoadSnapshotParams, MemBackendConfig, SnapshotType, +}; use crate::vmm_config::vsock::{VsockConfigError, VsockDeviceConfig}; use crate::vmm_config::{self, RateLimiterUpdate}; use crate::{EventManager, FcExitCode}; @@ -99,6 +101,9 @@ pub enum VmmAction { /// `BalloonDeviceConfig` as input. This action can only be called before the microVM /// has booted. SetBalloonDevice(BalloonDeviceConfig), + /// Set the memory backend for the VM. The VM will use this backend to handle its + /// memory. This action can only be called before the microVM has booted. + SetMemoryBackend(MemBackendConfig), /// Set the MMDS configuration. SetMmdsConfiguration(MmdsConfig), /// Set the vsock device or update the one that already exists using the @@ -422,6 +427,7 @@ impl<'a> PrebootApiController<'a> { SetBalloonDevice(config) => self.set_balloon_device(config), SetVsockDevice(config) => self.set_vsock_device(config), SetMmdsConfiguration(config) => self.set_mmds_config(config), + SetMemoryBackend(config) => self.set_memory_backend(config), StartMicroVm => self.start_microvm(), UpdateVmConfiguration(config) => self.update_vm_config(config), // Operations not allowed pre-boot. 
@@ -447,6 +453,13 @@ impl<'a> PrebootApiController<'a> { .map_err(VmmActionError::BalloonConfig) } + fn set_memory_backend(&mut self, cfg: MemBackendConfig) -> ActionResult { + self.boot_path = true; + self.vm_resources.memory_backend = Some(cfg); + + Ok(VmmData::Empty) + } + fn insert_block_device(&mut self, cfg: BlockDeviceConfig) -> ActionResult { self.boot_path = true; self.vm_resources @@ -654,6 +667,7 @@ impl RuntimeApiController { | InsertNetworkDevice(_) | LoadSnapshot(_) | SetBalloonDevice(_) + | SetMemoryBackend(_) | SetVsockDevice(_) | SetMmdsConfiguration(_) | StartMicroVm @@ -720,14 +734,14 @@ impl RuntimeApiController { fn create_snapshot(&mut self, create_params: &CreateSnapshotParams) -> ActionResult { log_dev_preview_warning("Virtual machine snapshots", None); - if create_params.snapshot_type == SnapshotType::Diff - && !self.vm_resources.track_dirty_pages() - { - return Err(VmmActionError::NotSupported( - "Diff snapshots are not allowed on uVMs with dirty page tracking disabled." - .to_string(), - )); - } + // if create_params.snapshot_type == SnapshotType::Diff + // && !self.vm_resources.track_dirty_pages() + // { + // return Err(VmmActionError::NotSupported( + // "Diff snapshots are not allowed on uVMs with dirty page tracking disabled." 
+ // .to_string(), + // )); + // } let mut locked_vmm = self.vmm.lock().unwrap(); let create_start_us = utils::time::get_time_us(utils::time::ClockType::Monotonic); @@ -862,6 +876,7 @@ mod tests { pub boot_timer: bool, // when `true`, all self methods are forced to fail pub force_errors: bool, + pub memory_backend: Option<MemBackendConfig>, } impl MockVmRes { diff --git a/src/vmm/src/vmm_config/snapshot.rs b/src/vmm/src/vmm_config/snapshot.rs index c8634b66980..d06a92f7d3a 100644 --- a/src/vmm/src/vmm_config/snapshot.rs +++ b/src/vmm/src/vmm_config/snapshot.rs @@ -28,7 +28,7 @@ impl Default for SnapshotType { /// 1) A file that contains the guest memory to be loaded, /// 2) An UDS where a custom page-fault handler process is listening for /// the UFFD set up by Firecracker to handle its guest memory page faults. -#[derive(Debug, Deserialize, PartialEq)] +#[derive(Debug, Clone, Deserialize, PartialEq)] pub enum MemBackendType { /// Guest memory contents will be loaded from a file. File, @@ -91,7 +91,7 @@ pub struct LoadSnapshotConfig { } /// Stores the configuration used for managing snapshot memory. -#[derive(Debug, Deserialize, PartialEq)] +#[derive(Debug, Clone, Deserialize, PartialEq)] #[serde(deny_unknown_fields)] pub struct MemBackendConfig { /// Path to the backend used to handle the guest memory.
diff --git a/tests/host_tools/uffd/Cargo.toml b/tests/host_tools/uffd/Cargo.toml index f83ed0f9c3f..725da59e84e 100644 --- a/tests/host_tools/uffd/Cargo.toml +++ b/tests/host_tools/uffd/Cargo.toml @@ -11,7 +11,7 @@ libc = ">=0.2.39" nix = "0.23.0" serde = { version = ">=1.0.27", features = ["derive"] } serde_json = ">=1.0.9" -userfaultfd = ">=0.4.0" +userfaultfd = ">=0.5.0" [workspace] diff --git a/tests/integration_tests/functional/test_mmds.py b/tests/integration_tests/functional/test_mmds.py index 870bce0a70a..92f735436df 100644 --- a/tests/integration_tests/functional/test_mmds.py +++ b/tests/integration_tests/functional/test_mmds.py @@ -832,7 +832,7 @@ def test_mmds_v2_negative(test_microvm_with_api, network_config): cmd = f"curl -m 2 -s -X PUT http://{DEFAULT_IPV4}/latest/api/token" expected = ( "Token time to live value not found. Use " - "`X-metadata-token-ttl_seconds` header to specify " + "`X-metadata-token-ttl-seconds` header to specify " "the token's lifetime." ) _run_guest_cmd(ssh_connection, cmd, expected) diff --git a/tools/devtool b/tools/devtool index 6d5bc487cc2..fcca289378f 100755 --- a/tools/devtool +++ b/tools/devtool @@ -615,6 +615,7 @@ run_devctr() { --rm \ --volume /dev:/dev \ --volume "$FC_ROOT_DIR:$CTR_FC_ROOT_DIR:z" \ + --mount type=bind,source=/usr/include/linux/userfaultfd.h,target=/usr/include/linux/userfaultfd.h \ --env OPT_LOCAL_IMAGES_PATH="$(dirname "$CTR_MICROVM_IMAGES_DIR")" \ --env PYTHONDONTWRITEBYTECODE=1 \ "$DEVCTR_IMAGE" "${ctr_args[@]}"