Skip to content

Commit b825c74

Browse files
committed
feat(pvtime): Add PVTime device support for ARM
- Added PVTime struct and logic to allocate and register per-vCPU steal_time regions.
- Integrated PVTime device initialization during microVM boot.
- Fixed the kvm_set_device_attr failure by aligning each steal_time region to 64 bytes.
- Added snapshot/restore support via the Persist interface.
- Refactored PVTime into its own module.

Signed-off-by: Dakshin Devanand <[email protected]>
1 parent f6fa622 commit b825c74

File tree

5 files changed

+252
-2
lines changed

5 files changed

+252
-2
lines changed

src/vmm/src/arch/aarch64/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ pub mod gic;
99
pub mod kvm;
1010
/// Layout for this aarch64 system.
1111
pub mod layout;
12+
/// Module for the paravirtualized time device
13+
pub mod pvtime;
1214
/// Logic for configuring aarch64 registers.
1315
pub mod regs;
1416
/// Architecture specific vCPU code

src/vmm/src/arch/aarch64/pvtime.rs

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
use displaydoc::Display;
4+
use kvm_bindings::{KVM_ARM_VCPU_PVTIME_CTRL, KVM_ARM_VCPU_PVTIME_IPA};
5+
use kvm_ioctls::VcpuFd;
6+
use serde::{Deserialize, Serialize};
7+
use thiserror::Error;
8+
use vm_memory::GuestAddress;
9+
10+
use crate::Vcpu;
11+
use crate::device_manager::resources::ResourceAllocator;
12+
use crate::snapshot::Persist;
13+
14+
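/// Size in bytes reserved for each vCPU's steal_time region. The same value is used as the allocation alignment, because of the 64-byte alignment requirement described in section 3.1 of <https://www.kernel.org/doc/html/v5.8/virt/kvm/devices/vcpu.html#attribute-kvm-arm-vcpu-pvtime-ipa>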
15+
pub const STEALTIME_STRUCT_MEM_SIZE: u64 = 64;
16+
17+
/// Represent PVTime device for ARM
18+
#[derive(Debug)]
19+
pub struct PVTime {
20+
/// Number of vCPUs
21+
vcpu_count: u8,
22+
/// The base IPA of the shared memory region
23+
base_ipa: GuestAddress,
24+
}
25+
26+
/// Errors associated with PVTime operations
27+
#[derive(Debug, Error, Display, PartialEq, Eq)]
28+
pub enum PVTimeError {
29+
/// Failed to allocate memory region: {0}
30+
AllocationFailed(vm_allocator::Error),
31+
/// Invalid VCPU ID: {0}
32+
InvalidVcpuIndex(u8),
33+
/// Error while setting or getting device attributes for vCPU: {0}
34+
DeviceAttribute(kvm_ioctls::Error),
35+
}
36+
37+
impl PVTime {
38+
/// Helper function to get the IPA of the steal_time region for a given vCPU
39+
fn get_steal_time_region_addr(&self, vcpu_index: u8) -> Result<GuestAddress, PVTimeError> {
40+
if vcpu_index >= self.vcpu_count {
41+
return Err(PVTimeError::InvalidVcpuIndex(vcpu_index));
42+
}
43+
Ok(GuestAddress(
44+
self.base_ipa.0 + (vcpu_index as u64 * STEALTIME_STRUCT_MEM_SIZE),
45+
))
46+
}
47+
48+
/// Create a new PVTime device given a base addr
49+
/// - Assumes total shared memory region from base addr is already allocated
50+
fn from_base(base_ipa: GuestAddress, vcpu_count: u8) -> Self {
51+
PVTime {
52+
vcpu_count,
53+
base_ipa,
54+
}
55+
}
56+
57+
/// Creates a new PVTime device by allocating new system memory for all vCPUs
58+
pub fn new(
59+
resource_allocator: &mut ResourceAllocator,
60+
vcpu_count: u8,
61+
) -> Result<Self, PVTimeError> {
62+
// This returns the IPA of the start of our shared memory region for all vCPUs.
63+
let base_ipa: GuestAddress = GuestAddress(
64+
resource_allocator
65+
.allocate_system_memory(
66+
STEALTIME_STRUCT_MEM_SIZE * vcpu_count as u64,
67+
STEALTIME_STRUCT_MEM_SIZE,
68+
vm_allocator::AllocPolicy::LastMatch,
69+
)
70+
.map_err(PVTimeError::AllocationFailed)?,
71+
);
72+
Ok(Self::from_base(base_ipa, vcpu_count))
73+
}
74+
75+
/// Check if PVTime is supported on vcpu
76+
pub fn is_supported(vcpu_fd: &VcpuFd) -> bool {
77+
// Check if pvtime is enabled
78+
let pvtime_device_attr = kvm_bindings::kvm_device_attr {
79+
group: kvm_bindings::KVM_ARM_VCPU_PVTIME_CTRL,
80+
attr: kvm_bindings::KVM_ARM_VCPU_PVTIME_IPA as u64,
81+
addr: 0,
82+
flags: 0,
83+
};
84+
85+
// Use kvm_has_device_attr to check if PVTime is supported
86+
vcpu_fd.has_device_attr(&pvtime_device_attr).is_ok()
87+
}
88+
89+
/// Register a vCPU with its pre-allocated steal time region
90+
fn register_vcpu(
91+
&self,
92+
vcpu_index: u8,
93+
vcpu_fd: &kvm_ioctls::VcpuFd,
94+
) -> Result<(), PVTimeError> {
95+
// Get IPA of the steal_time region for this vCPU
96+
let ipa = self.get_steal_time_region_addr(vcpu_index)?;
97+
98+
// Use KVM syscall (kvm_set_device_attr) to register the vCPU with the steal_time region
99+
let vcpu_device_attr = kvm_bindings::kvm_device_attr {
100+
group: KVM_ARM_VCPU_PVTIME_CTRL,
101+
attr: KVM_ARM_VCPU_PVTIME_IPA as u64,
102+
addr: &ipa.0 as *const u64 as u64, // userspace address of attr data
103+
flags: 0,
104+
};
105+
106+
vcpu_fd
107+
.set_device_attr(&vcpu_device_attr)
108+
.map_err(PVTimeError::DeviceAttribute)?;
109+
110+
Ok(())
111+
}
112+
113+
/// Register all vCPUs with their pre-allocated steal time regions
114+
pub fn register_all_vcpus(&self, vcpus: &mut [Vcpu]) -> Result<(), PVTimeError> {
115+
// Register the vcpu with the pvtime device to map its steal time region
116+
for (i, vcpu) in vcpus.iter().enumerate() {
117+
#[allow(clippy::cast_possible_truncation)]
118+
// We know vcpu_count is u8 according to VcpuConfig
119+
self.register_vcpu(i as u8, &vcpu.kvm_vcpu.fd)?;
120+
}
121+
Ok(())
122+
}
123+
}
124+
125+
/// Logic to save/restore the state of a PVTime device
126+
#[derive(Default, Debug, Clone, Serialize, Deserialize)]
127+
pub struct PVTimeState {
128+
/// base IPA of the total shared memory region
129+
pub base_ipa: u64,
130+
}
131+
132+
/// Arguments to restore a PVTime device from PVTimeState
133+
#[derive(Debug)]
134+
pub struct PVTimeConstructorArgs<'a> {
135+
/// For steal_time shared memory region
136+
pub resource_allocator: &'a mut ResourceAllocator,
137+
/// Number of vCPUs (should be consistent with pre-snapshot state)
138+
pub vcpu_count: u8,
139+
}
140+
141+
impl<'a> Persist<'a> for PVTime {
142+
type State = PVTimeState;
143+
type ConstructorArgs = PVTimeConstructorArgs<'a>;
144+
type Error = PVTimeError;
145+
146+
/// Save base IPA of PVTime device for persistence
147+
fn save(&self) -> Self::State {
148+
PVTimeState {
149+
base_ipa: self.base_ipa.0,
150+
}
151+
}
152+
153+
/// Restore state of PVTime device from given base IPA
154+
fn restore(
155+
constructor_args: Self::ConstructorArgs,
156+
state: &Self::State,
157+
) -> std::result::Result<Self, PVTimeError> {
158+
constructor_args
159+
.resource_allocator
160+
.allocate_system_memory(
161+
STEALTIME_STRUCT_MEM_SIZE * constructor_args.vcpu_count as u64,
162+
STEALTIME_STRUCT_MEM_SIZE,
163+
vm_allocator::AllocPolicy::ExactMatch(state.base_ipa),
164+
)
165+
.map_err(PVTimeError::AllocationFailed)?;
166+
Ok(Self::from_base(
167+
GuestAddress(state.base_ipa),
168+
constructor_args.vcpu_count,
169+
))
170+
}
171+
}

src/vmm/src/builder.rs

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
// SPDX-License-Identifier: Apache-2.0
33

44
//! Enables pre-boot setup, instantiation and booting of a Firecracker VMM.
5-
65
use std::fmt::Debug;
76
use std::io;
87
#[cfg(feature = "gdb")]
@@ -12,13 +11,16 @@ use std::sync::{Arc, Mutex};
1211
use event_manager::{MutEventSubscriber, SubscriberOps};
1312
use libc::EFD_NONBLOCK;
1413
use linux_loader::cmdline::Cmdline as LoaderKernelCmdline;
14+
use log::warn;
1515
use userfaultfd::Uffd;
1616
use utils::time::TimestampUs;
1717
#[cfg(target_arch = "aarch64")]
1818
use vm_superio::Rtc;
1919
use vm_superio::Serial;
2020
use vmm_sys_util::eventfd::EventFd;
2121

22+
#[cfg(target_arch = "aarch64")]
23+
use crate::arch::aarch64::pvtime::{PVTime, PVTimeConstructorArgs, PVTimeError};
2224
use crate::arch::{ConfigurationError, configure_system_for_boot, load_kernel};
2325
#[cfg(target_arch = "aarch64")]
2426
use crate::construct_kvm_mpidrs;
@@ -82,6 +84,8 @@ pub enum StartMicrovmError {
8284
CreateLegacyDevice(device_manager::legacy::LegacyDeviceError),
8385
/// Error creating VMGenID device: {0}
8486
CreateVMGenID(VmGenIdError),
87+
/// Error creating PVTime device: {0}
88+
CreatePVTime(PVTimeError),
8589
/// Invalid Memory Configuration: {0}
8690
GuestMemory(crate::vstate::memory::MemoryError),
8791
/// Error with initrd initialization: {0}.
@@ -189,6 +193,8 @@ fn create_vmm_and_vcpus(
189193
#[cfg(target_arch = "x86_64")]
190194
pio_device_manager,
191195
acpi_device_manager,
196+
#[cfg(target_arch = "aarch64")]
197+
pv_time: None,
192198
};
193199

194200
Ok((vmm, vcpus))
@@ -289,6 +295,17 @@ pub fn build_microvm_for_boot(
289295

290296
attach_vmgenid_device(&mut vmm)?;
291297

298+
// Attempt to setup PVTime, continue if not supported
299+
#[cfg(target_arch = "aarch64")]
300+
{
301+
vmm.pv_time = if PVTime::is_supported(&vcpus[0].kvm_vcpu.fd) {
302+
Some(setup_pv_time(&mut vmm, vcpus.as_mut())?)
303+
} else {
304+
warn!("PVTime is not supported by KVM. Steal time will not be reported to the guest.");
305+
None
306+
};
307+
}
308+
292309
configure_system_for_boot(
293310
&mut vmm,
294311
vcpus.as_mut(),
@@ -403,6 +420,8 @@ pub enum BuildMicrovmFromSnapshotError {
403420
ACPIDeviManager(#[from] ACPIDeviceManagerRestoreError),
404421
/// VMGenID update failed: {0}
405422
VMGenIDUpdate(std::io::Error),
423+
/// Failed to restore PVTime device: {0}
424+
RestorePVTime(#[from] PVTimeError),
406425
}
407426

408427
/// Builds and starts a microVM based on the provided MicrovmState.
@@ -453,6 +472,29 @@ pub fn build_microvm_from_snapshot(
453472
.map_err(BuildMicrovmFromSnapshotError::RestoreVcpus)?;
454473
}
455474

475+
// Restore the PVTime device
476+
#[cfg(target_arch = "aarch64")]
477+
{
478+
let pvtime_state = microvm_state.pvtime_state;
479+
if let Some(pvtime_state) = pvtime_state {
480+
#[allow(clippy::cast_possible_truncation)]
481+
// We know vcpu_count is u8 according to VcpuConfig
482+
let pvtime_ctor_args = PVTimeConstructorArgs {
483+
resource_allocator: &mut vmm.resource_allocator,
484+
vcpu_count: vcpus.len() as u8,
485+
};
486+
vmm.pv_time = Some(
487+
PVTime::restore(pvtime_ctor_args, &pvtime_state)
488+
.map_err(BuildMicrovmFromSnapshotError::RestorePVTime)?,
489+
);
490+
vmm.pv_time
491+
.as_ref()
492+
.unwrap()
493+
.register_all_vcpus(&mut vcpus) // We can safely unwrap here
494+
.map_err(StartMicrovmError::CreatePVTime)?;
495+
}
496+
}
497+
456498
#[cfg(target_arch = "aarch64")]
457499
{
458500
let mpidrs = construct_kvm_mpidrs(&microvm_state.vcpu_states);
@@ -548,6 +590,24 @@ pub fn setup_serial_device(
548590
Ok(serial)
549591
}
550592

593+
/// Creates the pvtime device and registers every vCPU with it.
///
/// Memory for the steal_time regions is taken from the VMM's resource allocator.
/// A failure in either step is reported as `StartMicrovmError::CreatePVTime`.
#[cfg(target_arch = "aarch64")]
fn setup_pv_time(vmm: &mut Vmm, vcpus: &mut [Vcpu]) -> Result<PVTime, StartMicrovmError> {
    use crate::arch::aarch64::pvtime::PVTime;

    // We know vcpu_count is u8 according to VcpuConfig
    #[allow(clippy::cast_possible_truncation)]
    let vcpu_count = vcpus.len() as u8;

    // Allocate the device first, then attach each vCPU to its steal_time region;
    // the device is handed back only if both steps succeed.
    PVTime::new(&mut vmm.resource_allocator, vcpu_count)
        .and_then(|pv_time| pv_time.register_all_vcpus(vcpus).map(|()| pv_time))
        .map_err(StartMicrovmError::CreatePVTime)
}
610+
551611
#[cfg(target_arch = "aarch64")]
552612
fn attach_legacy_devices_aarch64(
553613
event_manager: &mut EventManager,
@@ -851,6 +911,8 @@ pub(crate) mod tests {
851911
#[cfg(target_arch = "x86_64")]
852912
pio_device_manager,
853913
acpi_device_manager,
914+
#[cfg(target_arch = "aarch64")]
915+
pv_time: None,
854916
}
855917
}
856918

src/vmm/src/lib.rs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,8 @@ use vstate::kvm::Kvm;
134134
use vstate::vcpu::{self, StartThreadedError, VcpuSendEventError};
135135

136136
use crate::arch::DeviceType;
137+
#[cfg(target_arch = "aarch64")]
138+
use crate::arch::aarch64::pvtime::PVTime;
137139
use crate::cpu_config::templates::CpuConfiguration;
138140
#[cfg(target_arch = "x86_64")]
139141
use crate::device_manager::legacy::PortIODeviceManager;
@@ -157,7 +159,6 @@ use crate::vstate::memory::{
157159
use crate::vstate::vcpu::VcpuState;
158160
pub use crate::vstate::vcpu::{Vcpu, VcpuConfig, VcpuEvent, VcpuHandle, VcpuResponse};
159161
pub use crate::vstate::vm::Vm;
160-
161162
/// Shorthand type for the EventManager flavour used by Firecracker.
162163
pub type EventManager = BaseEventManager<Arc<Mutex<dyn MutEventSubscriber>>>;
163164

@@ -324,6 +325,8 @@ pub struct Vmm {
324325
#[cfg(target_arch = "x86_64")]
325326
pio_device_manager: PortIODeviceManager,
326327
acpi_device_manager: ACPIDeviceManager,
328+
#[cfg(target_arch = "aarch64")]
329+
pv_time: Option<PVTime>,
327330
}
328331

329332
impl Vmm {
@@ -523,6 +526,9 @@ impl Vmm {
523526

524527
let memory_state = self.guest_memory.describe();
525528
let acpi_dev_state = self.acpi_device_manager.save();
529+
#[cfg(target_arch = "aarch64")]
530+
let pvtime_state: Option<arch::aarch64::pvtime::PVTimeState> =
531+
self.pv_time.as_ref().map(|pvtime| pvtime.save());
526532

527533
Ok(MicrovmState {
528534
vm_info: vm_info.clone(),
@@ -532,6 +538,8 @@ impl Vmm {
532538
vcpu_states,
533539
device_states,
534540
acpi_dev_state,
541+
#[cfg(target_arch = "aarch64")]
542+
pvtime_state,
535543
})
536544
}
537545

src/vmm/src/persist.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ use serde::{Deserialize, Serialize};
1616
use userfaultfd::{FeatureFlags, Uffd, UffdBuilder};
1717
use vmm_sys_util::sock_ctrl_msg::ScmSocket;
1818

19+
#[cfg(target_arch = "aarch64")]
20+
use crate::arch::aarch64::pvtime::PVTimeState;
1921
#[cfg(target_arch = "aarch64")]
2022
use crate::arch::aarch64::vcpu::get_manufacturer_id_from_host;
2123
use crate::builder::{self, BuildMicrovmFromSnapshotError};
@@ -88,6 +90,9 @@ pub struct MicrovmState {
8890
pub device_states: DeviceStates,
8991
/// ACPI devices state.
9092
pub acpi_dev_state: ACPIDeviceManagerState,
93+
/// PVTime device state (optional for platforms that support it).
94+
#[cfg(target_arch = "aarch64")]
95+
pub pvtime_state: Option<PVTimeState>,
9196
}
9297

9398
/// This describes the mapping between Firecracker base virtual address and
@@ -765,6 +770,8 @@ mod tests {
765770
#[cfg(target_arch = "x86_64")]
766771
vm_state: vmm.vm.save_state().unwrap(),
767772
acpi_dev_state: vmm.acpi_device_manager.save(),
773+
#[cfg(target_arch = "aarch64")]
774+
pvtime_state: None,
768775
};
769776

770777
let mut buf = vec![0; 10000];

0 commit comments

Comments
 (0)