Skip to content

Commit faed3d3

Browse files
committed
feat(vmm): Use structs and methods for KVM_CAP_XSAVE2
Intel AMX is an XSTATE feature and TILEDATA is disabled by default because it requires a larger area to save its state than the traditional 4096 bytes. Instead, the Linux kernel allows VMMs to request the guest permission via `arch_prctl()`. As such, the size of the XSTATE buffer required to save XSTATE is dynamic. To support a dynamically-sized buffer, `KVM_CAP_XSAVE2` was introduced with `KVM_GET_XSAVE2`. Accordingly, kvm-bindings added `Xsave`, which is an alias of `FamStructWrapper` for the `kvm_xsave` struct with a FAM at the end, and kvm-ioctls added `get_xsave2()` for `KVM_GET_XSAVE2` and `set_xsave2()` to take `Xsave` to call `KVM_SET_XSAVE`. Change the type of `xsave` in `VcpuState` from `kvm_xsave` to `Xsave`. Use `get_xsave2()` and `set_xsave2()`. Signed-off-by: Takahiro Itazuri <[email protected]>
1 parent 8ae689f commit faed3d3

File tree

4 files changed

+127
-17
lines changed

4 files changed

+127
-17
lines changed

Cargo.lock

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/vmm/Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@ displaydoc = "0.2.5"
2222
event-manager = "0.4.0"
2323
gdbstub = { version = "0.7.3", optional = true }
2424
gdbstub_arch = { version = "0.3.1", optional = true }
25-
kvm-bindings = { version = "0.11.0", features = ["fam-wrappers", "serde"] }
26-
kvm-ioctls = "0.20.0"
25+
kvm-bindings = { version = "0.11.1", features = ["fam-wrappers", "serde"] }
26+
kvm-ioctls = "0.21.0"
2727
libc = "0.2.170"
2828
linux-loader = "0.13.0"
2929
log = { version = "0.4.26", features = ["std", "serde"] }

src/vmm/src/vstate/vcpu/x86_64.rs

Lines changed: 84 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,12 @@ use std::fmt::Debug;
1010

1111
use kvm_bindings::{
1212
kvm_debugregs, kvm_lapic_state, kvm_mp_state, kvm_regs, kvm_sregs, kvm_vcpu_events, kvm_xcrs,
13-
kvm_xsave, CpuId, Msrs, KVM_MAX_CPUID_ENTRIES, KVM_MAX_MSR_ENTRIES,
13+
kvm_xsave, kvm_xsave2, CpuId, Msrs, Xsave, KVM_MAX_CPUID_ENTRIES, KVM_MAX_MSR_ENTRIES,
1414
};
1515
use kvm_ioctls::{VcpuExit, VcpuFd};
1616
use log::{error, warn};
1717
use serde::{Deserialize, Serialize};
18-
use vmm_sys_util::fam;
18+
use vmm_sys_util::fam::{self, FamStruct};
1919

2020
use crate::arch::x86_64::gen::msr_index::{MSR_IA32_TSC, MSR_IA32_TSC_DEADLINE};
2121
use crate::arch::x86_64::interrupts;
@@ -74,8 +74,10 @@ pub enum KvmVcpuError {
7474
VcpuGetVcpuEvents(kvm_ioctls::Error),
7575
/// Failed to get KVM vcpu xcrs: {0}
7676
VcpuGetXcrs(kvm_ioctls::Error),
77-
/// Failed to get KVM vcpu xsave: {0}
77+
/// Failed to get KVM vcpu xsave via KVM_GET_XSAVE: {0}
7878
VcpuGetXsave(kvm_ioctls::Error),
79+
/// Failed to get KVM vcpu xsave via KVM_GET_XSAVE2: {0}
80+
VcpuGetXsave2(kvm_ioctls::Error),
7981
/// Failed to get KVM vcpu cpuid: {0}
8082
VcpuGetCpuid(kvm_ioctls::Error),
8183
/// Failed to get KVM TSC frequency: {0}
@@ -147,6 +149,10 @@ pub struct KvmVcpu {
147149
/// The list of MSRs to include in a VM snapshot, in the same order as KVM returned them
148150
/// from KVM_GET_MSR_INDEX_LIST
149151
msrs_to_save: Vec<u32>,
152+
/// Size in bytes required to hold the dynamically-sized `kvm_xsave` struct.
153+
///
154+
/// `None` if `KVM_CAP_XSAVE2` not supported.
155+
xsave2_size: Option<usize>,
150156
}
151157

152158
/// Vcpu peripherals
@@ -176,6 +182,7 @@ impl KvmVcpu {
176182
fd: kvm_vcpu,
177183
peripherals: Default::default(),
178184
msrs_to_save: vm.msrs_to_save().to_vec(),
185+
xsave2_size: vm.xsave2_size(),
179186
})
180187
}
181188

@@ -263,6 +270,66 @@ impl KvmVcpu {
263270
self.peripherals.pio_bus = Some(pio_bus);
264271
}
265272

273+
/// Get the current XSAVE state for this vCPU.
274+
///
275+
/// The C `kvm_xsave` struct was extended by adding a flexible array member (FAM) at the end
276+
/// to support variable-sized XSTATE buffer.
277+
///
278+
/// https://elixir.bootlin.com/linux/v6.13.6/source/arch/x86/include/uapi/asm/kvm.h#L381
279+
/// ```c
280+
/// struct kvm_xsave {
281+
/// __u32 region[1024];
282+
/// __u32 extra[];
283+
/// };
284+
/// ```
285+
///
286+
/// As shown above, the C `kvm_xsave` struct does not have any field for the size of itself or
287+
/// the length of its FAM. The required size (in bytes) of `kvm_xsave` struct can be retrieved
288+
/// via `KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)`.
289+
///
290+
/// kvm-bindings defines `kvm_xsave2` struct that wraps the `kvm_xsave` struct to have `len`
291+
/// field that indicates the number of FAM entries (i.e. `extra`), it also defines `Xsave` as
292+
/// a `FamStructWrapper` of `kvm_xsave2`.
293+
///
294+
/// https://github.com/rust-vmm/kvm/blob/68fff5491703bf32bd35656f7ba994a4cae9ea7d/kvm-bindings/src/x86_64/fam_wrappers.rs#L106
295+
/// ```rs
296+
/// pub struct kvm_xsave2 {
297+
/// pub len: usize,
298+
/// pub xsave: kvm_xsave,
299+
/// }
300+
/// ```
301+
fn get_xsave(&self) -> Result<Xsave, KvmVcpuError> {
302+
match self.xsave2_size {
303+
// if `KVM_CAP_XSAVE2` supported
304+
Some(xsave2_size) => {
305+
// Convert the `kvm_xsave` size in bytes to the length of FAM (i.e. `extra`).
306+
let fam_len =
307+
// Calculate the size of FAM (`extra`) area in bytes. Note that the subtraction
308+
// never underflows because `KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)` always returns
309+
// at least 4096 bytes that is the size of `kvm_xsave` without FAM area.
310+
(xsave2_size - std::mem::size_of::<kvm_xsave>())
311+
// Divide by the size of FAM (`extra`) entry (i.e. `__u32`).
312+
.div_ceil(std::mem::size_of::<<kvm_xsave2 as FamStruct>::Entry>());
313+
let mut xsave = Xsave::new(fam_len).map_err(KvmVcpuError::Fam)?;
314+
// SAFETY: Safe because `xsave` is allocated with enough size to save XSTATE.
315+
unsafe { self.fd.get_xsave2(&mut xsave) }.map_err(KvmVcpuError::VcpuGetXsave2)?;
316+
Ok(xsave)
317+
}
318+
// if `KVM_CAP_XSAVE2` not supported
319+
None => Ok(
320+
// SAFETY: The content is correctly laid out.
321+
unsafe {
322+
Xsave::from_raw(vec![kvm_xsave2 {
323+
// Note that `len` is the number of FAM (`extra`) entries that didn't exist
324+
// on older kernels not supporting `KVM_CAP_XSAVE2`. Thus, it's always zero.
325+
len: 0,
326+
xsave: self.fd.get_xsave().map_err(KvmVcpuError::VcpuGetXsave)?,
327+
}])
328+
},
329+
),
330+
}
331+
}
332+
266333
/// Get the current TSC frequency for this vCPU.
267334
///
268335
/// # Errors
@@ -496,7 +563,7 @@ impl KvmVcpu {
496563
.map_err(KvmVcpuError::VcpuGetMpState)?;
497564
let regs = self.fd.get_regs().map_err(KvmVcpuError::VcpuGetRegs)?;
498565
let sregs = self.fd.get_sregs().map_err(KvmVcpuError::VcpuGetSregs)?;
499-
let xsave = self.fd.get_xsave().map_err(KvmVcpuError::VcpuGetXsave)?;
566+
let xsave = self.get_xsave()?;
500567
let xcrs = self.fd.get_xcrs().map_err(KvmVcpuError::VcpuGetXcrs)?;
501568
let debug_regs = self
502569
.fd
@@ -601,9 +668,17 @@ impl KvmVcpu {
601668
self.fd
602669
.set_sregs(&state.sregs)
603670
.map_err(KvmVcpuError::VcpuSetSregs)?;
604-
self.fd
605-
.set_xsave(&state.xsave)
606-
.map_err(KvmVcpuError::VcpuSetXsave)?;
671+
// SAFETY: Safe unless the snapshot is corrupted.
672+
unsafe {
673+
// kvm-ioctls' `set_xsave2()` can be called even on kernel versions not supporting
674+
// `KVM_CAP_XSAVE2`, because it internally calls `KVM_SET_XSAVE` API that was extended
675+
// by Linux kernel. Thus, `KVM_SET_XSAVE2` API does not exist as a KVM interface.
676+
// However, kvm-ioctls added `set_xsave2()` to allow users to pass `Xsave` instead of the
677+
// older `kvm_xsave`.
678+
self.fd
679+
.set_xsave2(&state.xsave)
680+
.map_err(KvmVcpuError::VcpuSetXsave)?;
681+
}
607682
self.fd
608683
.set_xcrs(&state.xcrs)
609684
.map_err(KvmVcpuError::VcpuSetXcrs)?;
@@ -684,7 +759,7 @@ pub struct VcpuState {
684759
/// Xcrs.
685760
pub xcrs: kvm_xcrs,
686761
/// Xsave.
687-
pub xsave: kvm_xsave,
762+
pub xsave: Xsave,
688763
/// Tsc khz.
689764
pub tsc_khz: Option<u32>,
690765
}
@@ -744,7 +819,7 @@ mod tests {
744819
sregs: Default::default(),
745820
vcpu_events: Default::default(),
746821
xcrs: Default::default(),
747-
xsave: Default::default(),
822+
xsave: Xsave::new(0).unwrap(),
748823
tsc_khz: Some(0),
749824
}
750825
}

src/vmm/src/vstate/vm/x86_64.rs

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ use kvm_bindings::{
77
kvm_clock_data, kvm_irqchip, kvm_pit_config, kvm_pit_state2, MsrList, KVM_CLOCK_TSC_STABLE,
88
KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, KVM_PIT_SPEAKER_DUMMY,
99
};
10-
use kvm_ioctls::VmFd;
10+
use kvm_ioctls::{Cap, VmFd};
1111
use serde::{Deserialize, Serialize};
1212

1313
use crate::arch::x86_64::msr::MsrError;
@@ -19,6 +19,8 @@ use crate::vstate::vm::VmError;
1919
#[cfg(target_arch = "x86_64")]
2020
#[derive(Debug, PartialEq, Eq, thiserror::Error, displaydoc::Display)]
2121
pub enum ArchVmError {
22+
/// Failed to check KVM capability (0): {1}
23+
CheckCapability(Cap, kvm_ioctls::Error),
2224
/// Set PIT2 error: {0}
2325
SetPit2(kvm_ioctls::Error),
2426
/// Set clock error: {0}
@@ -48,6 +50,10 @@ pub enum ArchVmError {
4850
pub struct ArchVm {
4951
pub(super) fd: VmFd,
5052
msrs_to_save: MsrList,
53+
/// Size in bytes requiring to hold the dynamically-sized `kvm_xsave` struct.
54+
///
55+
/// `None` if `KVM_CAP_XSAVE2` not supported.
56+
xsave2_size: Option<usize>,
5157
}
5258

5359
impl ArchVm {
@@ -57,10 +63,34 @@ impl ArchVm {
5763

5864
let msrs_to_save = kvm.msrs_to_save().map_err(ArchVmError::GetMsrsToSave)?;
5965

66+
// `KVM_CAP_XSAVE2` was introduced to support dynamically-sized XSTATE buffer in kernel
67+
// v5.17. `KVM_CHECK_EXTENSION(KVM_CAP_XSAVE2)` returns the required size in bytes if
68+
// supported; otherwise returns 0.
69+
// https://github.com/torvalds/linux/commit/be50b2065dfa3d88428fdfdc340d154d96bf6848
70+
//
71+
// Cache the value in order not to call it at each vCPU creation.
72+
let xsave2_size = match fd.check_extension_int(Cap::Xsave2) {
73+
// Catch all negative values just in case although the possible negative returned value
74+
// of ioctl() is only -1.
75+
i32::MIN..=-1 => {
76+
return Err(VmError::Arch(ArchVmError::CheckCapability(
77+
Cap::Xsave2,
78+
vmm_sys_util::errno::Error::last(),
79+
)));
80+
}
81+
0 => None,
82+
// SAFETY: Safe because negative values are handled above.
83+
ret => Some(usize::try_from(ret).unwrap()),
84+
};
85+
6086
fd.set_tss_address(u64_to_usize(crate::arch::x86_64::layout::KVM_TSS_ADDRESS))
6187
.map_err(ArchVmError::SetTssAddress)?;
6288

63-
Ok(ArchVm { fd, msrs_to_save })
89+
Ok(ArchVm {
90+
fd,
91+
msrs_to_save,
92+
xsave2_size,
93+
})
6494
}
6595

6696
pub(super) fn arch_pre_create_vcpus(&mut self, _: u8) -> Result<(), ArchVmError> {
@@ -162,6 +192,11 @@ impl ArchVm {
162192
pub fn msrs_to_save(&self) -> &[u32] {
163193
self.msrs_to_save.as_slice()
164194
}
195+
196+
/// Gets the size (in bytes) of the `kvm_xsave` struct.
197+
pub fn xsave2_size(&self) -> Option<usize> {
198+
self.xsave2_size
199+
}
165200
}
166201

167202
#[derive(Default, Deserialize, Serialize)]

0 commit comments

Comments
 (0)