Skip to content

Commit e1de617

Browse files
committed
feat: Request permission for Intel AMX
Intel AMX (Advanced Matrix Extensions) was introduced in Intel Sapphire Rapids to accelerate deep learning and AI workloads. Since it requires a larger area to save its state, the TILEDATA feature is disabled by default. We request permission for it by default because it can still be disabled via a CPU template. Without the request, kernels prior to v6.4 have a bug where KVM_GET_SUPPORTED_CPUID returns an inconsistent state with TILECFG enabled but TILEDATA disabled, causing a #GP fault in the guest on the xsetbv instruction. Signed-off-by: Takahiro Itazuri <[email protected]>
1 parent d6a52b8 commit e1de617

File tree

2 files changed

+129
-1
lines changed

2 files changed

+129
-1
lines changed

src/vmm/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ license = "Apache-2.0"
99
bench = false
1010

1111
[dependencies]
12-
acpi_tables = { path = "../acpi-tables" }
12+
acpi_tables = { path = "../acpi-tables" }
1313
aes-gcm = { version = "0.10.1", default-features = false, features = ["aes"] }
1414
arrayvec = { version = "0.7.6", optional = true }
1515
aws-lc-rs = { version = "1.12.4", features = ["bindgen"] }

src/vmm/src/vstate/kvm.rs

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,11 @@ use kvm_bindings::KVM_API_VERSION;
66
use kvm_bindings::{CpuId, MsrList, KVM_MAX_CPUID_ENTRIES};
77
use kvm_ioctls::Kvm as KvmFd;
88
use serde::{Deserialize, Serialize};
9+
#[cfg(target_arch = "x86_64")]
10+
use vmm_sys_util::syscall::SyscallReturnCode;
911

12+
#[cfg(target_arch = "x86_64")]
13+
use crate::arch::x86_64::gen::arch_prctl;
1014
use crate::cpu_config::templates::KvmCapability;
1115
use crate::vstate::memory::{GuestMemory, GuestMemoryMmap};
1216

@@ -25,8 +29,14 @@ pub enum KvmError {
2529
#[cfg(target_arch = "x86_64")]
2630
/// Failed to get supported cpuid: {0}
2731
GetSupportedCpuId(kvm_ioctls::Error),
32+
#[cfg(target_arch = "x86_64")]
33+
/// Failed to get supported XSTATE features: {0}
34+
GetSupportedXstateFeatures(std::io::Error),
2835
/// The number of configured slots is bigger than the maximum reported by KVM
2936
NotEnoughMemorySlots,
37+
#[cfg(target_arch = "x86_64")]
38+
/// Failed to request permission for a XSTATE feature ({0}): {1}
39+
RequestXstateFeatures(u32, std::io::Error),
3040
}
3141

3242
/// Struct with kvm fd and kvm associated parameters.
@@ -73,6 +83,15 @@ impl Kvm {
7383

7484
#[cfg(target_arch = "x86_64")]
7585
{
86+
// Request permission for Intel AMX (Advanced Matrix Extensions) TILEDATA.
87+
//
88+
// Unless requested, on kernels prior to v6.4, KVM_GET_SUPPORTED_CPUID returns an
89+
// inconsistent state where TILECFG is set but TILEDATA isn't. Such a half-enabled state
90+
// causes a guest crash during boot because the guest executes the XSETBV instruction with
91+
// all XSAVE feature bits enumerated in CPUID, and XSETBV requires the two Intel AMX bits
92+
// to be either both enabled or both disabled; otherwise it raises a general protection fault.
93+
Self::request_xstate_feature_permission(arch_prctl::ARCH_XCOMP_TILEDATA)?;
94+
7695
let supported_cpuid = kvm_fd
7796
.get_supported_cpuid(KVM_MAX_CPUID_ENTRIES)
7897
.map_err(KvmError::GetSupportedCpuId)?;
@@ -86,6 +105,65 @@ impl Kvm {
86105
}
87106
}
88107

108+
/// Request permission for a dynamic XSTATE feature.
109+
///
110+
/// Some XSTATE features are not permitted by default, because they require a larger area
111+
/// to save their states than the traditional 4096-byte area. Instead, the permission for them
112+
/// can be requested via arch_prctl().
113+
/// https://github.com/torvalds/linux/blob/master/Documentation/arch/x86/xstate.rst
114+
///
115+
/// We request permission for them by default if available in order to retrieve the correct
116+
/// supported feature set via KVM_GET_SUPPORTED_CPUID.
117+
/// https://docs.kernel.org/virt/kvm/api.html#kvm-get-supported-cpuid
118+
///
119+
/// Note that such requested features can be disabled by a CPU template and no memory allocation
120+
/// to save their states happens here immediately.
121+
#[cfg(target_arch = "x86_64")]
122+
fn request_xstate_feature_permission(xfeature: u32) -> Result<(), KvmError> {
123+
// Get the supported dynamic xstate features.
124+
let mut supported_xfeatures: libc::c_ulong = 0;
125+
// SAFETY: Safe because the third input (`addr`) is a valid `c_ulong` pointer.
126+
// https://man7.org/linux/man-pages/man2/arch_prctl.2.html
127+
SyscallReturnCode(unsafe {
128+
libc::syscall(
129+
libc::SYS_arch_prctl,
130+
arch_prctl::ARCH_GET_XCOMP_SUPP,
131+
&mut supported_xfeatures as *mut libc::c_ulong,
132+
)
133+
})
134+
.into_empty_result()
135+
.or_else(|err| {
136+
if err.raw_os_error() == Some(libc::EINVAL) {
137+
// EINVAL is returned if the dynamic XSTATE feature enabling is not supported (e.g.
138+
// kernel version prior to v5.17).
139+
// https://github.com/torvalds/linux/commit/980fe2fddcff21937c93532b4597c8ea450346c1
140+
//
141+
// `supported_xfeatures` remains 0 here, so the permission request below is skipped.
142+
Ok(())
143+
} else {
144+
Err(err)
145+
}
146+
})
147+
.map_err(KvmError::GetSupportedXstateFeatures)?;
148+
149+
// Request permission for the given XSTATE feature only if available
150+
let xfeature_mask: libc::c_ulong = 1u64 << xfeature;
151+
if (supported_xfeatures & xfeature_mask) == xfeature_mask {
152+
// SAFETY: Safe because all inputs are valid as `c_ulong` values.
153+
SyscallReturnCode(unsafe {
154+
libc::syscall(
155+
libc::SYS_arch_prctl,
156+
arch_prctl::ARCH_REQ_XCOMP_GUEST_PERM,
157+
xfeature,
158+
)
159+
})
160+
.into_empty_result()
161+
.map_err(|err| KvmError::RequestXstateFeatures(xfeature, err))?;
162+
}
163+
164+
Ok(())
165+
}
166+
89167
/// Msrs needed to be saved on snapshot creation.
90168
#[cfg(target_arch = "x86_64")]
91169
pub fn msrs_to_save(&self) -> Result<MsrList, crate::arch::x86_64::msr::MsrError> {
@@ -215,4 +293,54 @@ pub(crate) mod tests {
215293
.iter()
216294
.any(|c| *c == kvm_bindings::KVM_CAP_IOEVENTFD));
217295
}
296+
297+
#[cfg(target_arch = "x86_64")]
298+
#[test]
299+
fn test_request_xstate_feature_permission() {
300+
// Test request_xstate_feature_permission() for Intel AMX.
301+
Kvm::request_xstate_feature_permission(arch_prctl::ARCH_XCOMP_TILEDATA).unwrap();
302+
303+
let mut supported_xfeatures: libc::c_ulong = 0;
304+
// SAFETY: Safe because the third input (`addr`) is a valid `c_ulong` pointer.
305+
match SyscallReturnCode(unsafe {
306+
libc::syscall(
307+
libc::SYS_arch_prctl,
308+
arch_prctl::ARCH_GET_XCOMP_SUPP,
309+
&mut supported_xfeatures as *mut libc::c_ulong,
310+
)
311+
})
312+
.into_empty_result()
313+
{
314+
Ok(_) => {} // Continue this test
315+
Err(err) if err.raw_os_error() == Some(libc::EINVAL) => {
316+
// Dynamic XSTATE feature enabling is not supported in the first place, so nothing
317+
// to test on this kernel version.
318+
return;
319+
}
320+
Err(err) => panic!("Unexpected error: {}", err),
321+
};
322+
323+
// If Intel AMX is not supported, nothing to test on this CPU.
324+
let intel_amx_feature_mask: libc::c_ulong = 1u64 << arch_prctl::ARCH_XCOMP_TILEDATA;
325+
if supported_xfeatures & intel_amx_feature_mask != intel_amx_feature_mask {
326+
return;
327+
}
328+
329+
let mut permitted_xfeatures: libc::c_ulong = 0;
330+
// SAFETY: Safe because the third input (`addr`) is a valid `c_ulong` pointer.
331+
SyscallReturnCode(unsafe {
332+
libc::syscall(
333+
libc::SYS_arch_prctl,
334+
arch_prctl::ARCH_GET_XCOMP_GUEST_PERM,
335+
&mut permitted_xfeatures as *mut libc::c_ulong,
336+
)
337+
})
338+
.into_empty_result()
339+
.unwrap();
340+
341+
assert_eq!(
342+
permitted_xfeatures & intel_amx_feature_mask,
343+
intel_amx_feature_mask
344+
);
345+
}
218346
}

0 commit comments

Comments
 (0)