diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index f1464a0a585..725190eb3ce 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -9,7 +9,7 @@ license = "Apache-2.0" bench = false [dependencies] -acpi_tables = { path = "../acpi-tables" } +acpi_tables = { path = "../acpi-tables" } aes-gcm = { version = "0.10.1", default-features = false, features = ["aes"] } arrayvec = { version = "0.7.6", optional = true } aws-lc-rs = { version = "1.12.4", features = ["bindgen"] } diff --git a/src/vmm/src/arch/x86_64/gen/arch_prctl.rs b/src/vmm/src/arch/x86_64/gen/arch_prctl.rs new file mode 100644 index 00000000000..768964e494b --- /dev/null +++ b/src/vmm/src/arch/x86_64/gen/arch_prctl.rs @@ -0,0 +1,43 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +// automatically generated by tools/bindgen.sh + +#![allow( + non_camel_case_types, + non_upper_case_globals, + dead_code, + non_snake_case, + clippy::ptr_as_ptr, + clippy::undocumented_unsafe_blocks, + missing_debug_implementations, + clippy::tests_outside_test_module +)] + +pub const ARCH_SET_GS: u32 = 4097; +pub const ARCH_SET_FS: u32 = 4098; +pub const ARCH_GET_FS: u32 = 4099; +pub const ARCH_GET_GS: u32 = 4100; +pub const ARCH_GET_CPUID: u32 = 4113; +pub const ARCH_SET_CPUID: u32 = 4114; +pub const ARCH_GET_XCOMP_SUPP: u32 = 4129; +pub const ARCH_GET_XCOMP_PERM: u32 = 4130; +pub const ARCH_REQ_XCOMP_PERM: u32 = 4131; +pub const ARCH_GET_XCOMP_GUEST_PERM: u32 = 4132; +pub const ARCH_REQ_XCOMP_GUEST_PERM: u32 = 4133; +pub const ARCH_XCOMP_TILECFG: u32 = 17; +pub const ARCH_XCOMP_TILEDATA: u32 = 18; +pub const ARCH_MAP_VDSO_X32: u32 = 8193; +pub const ARCH_MAP_VDSO_32: u32 = 8194; +pub const ARCH_MAP_VDSO_64: u32 = 8195; +pub const ARCH_GET_UNTAG_MASK: u32 = 16385; +pub const ARCH_ENABLE_TAGGED_ADDR: u32 = 16386; +pub const ARCH_GET_MAX_TAG_BITS: u32 = 16387; +pub const ARCH_FORCE_TAGGED_SVA: u32 = 16388; +pub const ARCH_SHSTK_ENABLE: u32 = 20481; 
+pub const ARCH_SHSTK_DISABLE: u32 = 20482; +pub const ARCH_SHSTK_LOCK: u32 = 20483; +pub const ARCH_SHSTK_UNLOCK: u32 = 20484; +pub const ARCH_SHSTK_STATUS: u32 = 20485; +pub const ARCH_SHSTK_SHSTK: u32 = 1; +pub const ARCH_SHSTK_WRSS: u32 = 2; diff --git a/src/vmm/src/arch/x86_64/gen/mod.rs b/src/vmm/src/arch/x86_64/gen/mod.rs index 35c2ca225e8..bded9e51455 100644 --- a/src/vmm/src/arch/x86_64/gen/mod.rs +++ b/src/vmm/src/arch/x86_64/gen/mod.rs @@ -5,9 +5,9 @@ // Use of this source code is governed by a BSD-style license that can be // found in the THIRD-PARTY file. +pub mod arch_prctl; pub mod hyperv; pub mod hyperv_tlfs; pub mod mpspec; - pub mod msr_index; pub mod perf_event; diff --git a/src/vmm/src/vstate/kvm.rs b/src/vmm/src/vstate/kvm.rs index 20585b337fc..3ff9db9145e 100644 --- a/src/vmm/src/vstate/kvm.rs +++ b/src/vmm/src/vstate/kvm.rs @@ -6,14 +6,18 @@ use kvm_bindings::KVM_API_VERSION; use kvm_bindings::{CpuId, MsrList, KVM_MAX_CPUID_ENTRIES}; use kvm_ioctls::Kvm as KvmFd; use serde::{Deserialize, Serialize}; +#[cfg(target_arch = "x86_64")] +use vmm_sys_util::syscall::SyscallReturnCode; +#[cfg(target_arch = "x86_64")] +use crate::arch::x86_64::gen::arch_prctl; use crate::cpu_config::templates::KvmCapability; use crate::vstate::memory::{GuestMemory, GuestMemoryMmap}; /// Errors associated with the wrappers over KVM ioctls. 
/// Needs `rustfmt::skip` to make multiline comments work #[rustfmt::skip] -#[derive(Debug, PartialEq, Eq, thiserror::Error, displaydoc::Display)] +#[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum KvmError { /// The host kernel reports an invalid KVM API version: {0} ApiVersion(i32), @@ -25,8 +29,14 @@ pub enum KvmError { #[cfg(target_arch = "x86_64")] /// Failed to get supported cpuid: {0} GetSupportedCpuId(kvm_ioctls::Error), + #[cfg(target_arch = "x86_64")] + /// Failed to get supported XSTATE features: {0} + GetSupportedXstateFeatures(std::io::Error), /// The number of configured slots is bigger than the maximum reported by KVM NotEnoughMemorySlots, + #[cfg(target_arch = "x86_64")] + /// Failed to enable XSTATE features ({0:#b}): {1} + RequestXstateFeatures(u32, std::io::Error), } /// Struct with kvm fd and kvm associated paramenters. @@ -73,6 +83,8 @@ impl Kvm { #[cfg(target_arch = "x86_64")] { + Self::enable_intel_amx()?; + let supported_cpuid = kvm_fd .get_supported_cpuid(KVM_MAX_CPUID_ENTRIES) .map_err(KvmError::GetSupportedCpuId)?; @@ -86,6 +98,74 @@ impl Kvm { } } + #[cfg(target_arch = "x86_64")] + // XSTATE feature mask for Intel AMX. + const INTEL_AMX_XCOMP_MASK: libc::c_ulong = + (1u64 << arch_prctl::ARCH_XCOMP_TILECFG) | (1u64 << arch_prctl::ARCH_XCOMP_TILEDATA); + + /// Enable Intel AMX if available. + /// + /// Intel AMX (Advanced Matrix Extensions) is an instruction set for AI workloads that was + /// introduced in Intel Sapphire Rapids (*7i.metal). Since it requires a larger area to save + /// its state, it is disabled by default. + /// https://github.com/torvalds/linux/blob/master/Documentation/arch/x86/xstate.rst + /// + /// We enable it by default (it can still be disabled by a CPU template); otherwise, + /// KVM_GET_SUPPORTED_CPUID returns an inconsistent state where TILECFG is enabled but TILEDATA + /// is disabled, causing a #GP fault in the guest on xsetbv due to the lack of a sanity check. 
+ /// https://lore.kernel.org/all/20230405004520.421768-1-seanjc@google.com/ + /// + /// Dynamically-enabled feature bits need to be requested with arch_prctl() before calling + /// KVM_GET_SUPPORTED_CPUID. Feature bits that have not been requested are excluded from the + /// result of KVM_GET_SUPPORTED_CPUID. + /// https://docs.kernel.org/virt/kvm/api.html + /// + /// Note that no memory allocation to save Intel AMX state happens here immediately. + #[cfg(target_arch = "x86_64")] + fn enable_intel_amx() -> Result<(), KvmError> { + // Get the supported xstate features. + let mut supported_xfeatures: libc::c_ulong = 0; + // SAFETY: Safe because the second input (`op`) might not be valid for unsupported kernels + // but EINVAL is handled later, and the third input (`addr`) is a valid c_ulong pointer. + // https://man7.org/linux/man-pages/man2/arch_prctl.2.html + SyscallReturnCode(unsafe { + libc::syscall( + libc::SYS_arch_prctl, + arch_prctl::ARCH_GET_XCOMP_SUPP, + &mut supported_xfeatures as *mut libc::c_ulong, + ) + }) + .into_empty_result() + .or_else(|err| { + // EINVAL is returned if ARCH_GET_XCOMP_SUPP is not supported (e.g. kernel versions + // prior to v5.17). + // https://github.com/torvalds/linux/commit/980fe2fddcff21937c93532b4597c8ea450346c1 + if err.raw_os_error() == Some(libc::EINVAL) { + Ok(()) + } else { + Err(err) + } + }) + .map_err(KvmError::GetSupportedXstateFeatures)?; + + // Enable Intel AMX if supported. + if (supported_xfeatures & Self::INTEL_AMX_XCOMP_MASK) == Self::INTEL_AMX_XCOMP_MASK { + // SAFETY: Safe because ARCH_REQ_XCOMP_GUEST_PERM is supported if ARCH_GET_XCOMP_SUPP is + // supported and it has been confirmed that ARCH_XCOMP_TILEDATA is supported. 
+ SyscallReturnCode(unsafe { + libc::syscall( + libc::SYS_arch_prctl, + arch_prctl::ARCH_REQ_XCOMP_GUEST_PERM, + arch_prctl::ARCH_XCOMP_TILEDATA, + ) + }) + .into_empty_result() + .map_err(|err| KvmError::RequestXstateFeatures(arch_prctl::ARCH_XCOMP_TILEDATA, err))?; + } + + Ok(()) + } + /// Msrs needed to be saved on snapshot creation. #[cfg(target_arch = "x86_64")] pub fn msrs_to_save(&self) -> Result { @@ -215,4 +295,99 @@ pub(crate) mod tests { .iter() .any(|c| *c == kvm_bindings::KVM_CAP_IOEVENTFD)); } + + #[cfg(target_arch = "x86_64")] + mod x86_64 { + use super::*; + use crate::arch::x86_64::cpu_model::CpuModel; + + #[derive(PartialEq, PartialOrd)] + struct KernelVersion(u32, u32); + + impl KernelVersion { + fn current() -> Self { + let version_str = std::fs::read_to_string("/proc/sys/kernel/osrelease").unwrap(); + let mut parts = version_str.trim().split('.'); + + let major = parts.next().unwrap().parse::<u32>().unwrap(); + let minor = parts.next().unwrap().parse::<u32>().unwrap(); + + KernelVersion(major, minor) + } + } + + #[derive(PartialEq)] + enum Vendor { + Intel, + Amd, + } + + impl Vendor { + fn new() -> Self { + let vendor_id = Self::get_vendor_id_str(); + match vendor_id.as_str() { + "GenuineIntel" => Vendor::Intel, + "AuthenticAMD" => Vendor::Amd, + _ => panic!("Unknown vendor_id: {}", vendor_id), + } + } + + fn get_vendor_id_str() -> String { + let cpuinfo = std::fs::read_to_string("/proc/cpuinfo").unwrap(); + + for line in cpuinfo.lines() { + if line.starts_with("vendor_id") { + return line + .split(':') + .nth(1) + .map(|s| s.trim().to_string()) + .unwrap(); + } + } + panic!("`vendor_id` not found in /proc/cpuinfo"); + } + } + + #[cfg(target_arch = "x86_64")] + #[test] + fn test_enable_intel_amx() { + Kvm::enable_intel_amx().unwrap(); + + // ARCH_{REQ,GET}_XCOMP_GUEST_PERM were added in kernel v5.17. 
+ // https://github.com/torvalds/linux/commit/980fe2fddcff21937c93532b4597c8ea450346c1 + let supported_version = KernelVersion(5, 17); + let current_version = KernelVersion::current(); + + if current_version >= supported_version { + let mut permitted_xfeatures: libc::c_ulong = 0; + // SAFETY: Safe because the second input (`op`) should be valid on kernel v5.17+, + // and the third input (`addr`) is a valid `c_ulong` pointer. + SyscallReturnCode(unsafe { + libc::syscall( + libc::SYS_arch_prctl, + arch_prctl::ARCH_GET_XCOMP_GUEST_PERM, + &mut permitted_xfeatures as *mut libc::c_ulong, + ) + }) + .into_empty_result() + .unwrap(); + + // Intel AMX is available only on Intel processors now. + let vendor = Vendor::new(); + + // Intel AMX was introduced in Intel Sapphire Rapids (CPUID.01H:EAX = 0x000806f8). + let supported_cpu = CpuModel::from(&0x000806f8); + let current_cpu = CpuModel::get_cpu_model(); + + if current_cpu >= supported_cpu && vendor == Vendor::Intel { + assert_eq!( + permitted_xfeatures & Kvm::INTEL_AMX_XCOMP_MASK, + Kvm::INTEL_AMX_XCOMP_MASK + ); + } else { + assert_eq!(permitted_xfeatures & Kvm::INTEL_AMX_XCOMP_MASK, 0); + } + } + } + } } diff --git a/tests/framework/utils_cpuid.py b/tests/framework/utils_cpuid.py index 979f4478c8b..a3988bf7f85 100644 --- a/tests/framework/utils_cpuid.py +++ b/tests/framework/utils_cpuid.py @@ -32,6 +32,7 @@ class CpuModel(str, Enum): INTEL_SKYLAKE = "INTEL_SKYLAKE" INTEL_CASCADELAKE = "INTEL_CASCADELAKE" INTEL_ICELAKE = "INTEL_ICELAKE" + INTEL_SAPPHIRE_RAPIDS = "INTEL_SAPPHIRE_RAPIDS" CPU_DICT = { @@ -40,6 +41,7 @@ class CpuModel(str, Enum): "Intel(R) Xeon(R) Platinum 8124M CPU": "INTEL_SKYLAKE", "Intel(R) Xeon(R) Platinum 8259CL CPU": "INTEL_CASCADELAKE", "Intel(R) Xeon(R) Platinum 8375C CPU": "INTEL_ICELAKE", + "Intel(R) Xeon(R) Platinum 8488C": "INTEL_SAPPHIRE_RAPIDS", }, CpuVendor.AMD: {"AMD EPYC 7R13": "AMD_MILAN", "AMD EPYC 9R14": "AMD_GENOA"}, CpuVendor.ARM: { @@ -83,6 +85,8 @@ def 
get_cpu_codename(default="Unknown"): result = re.match(r"^(.*) @.*$", cpu_model) if result: return CPU_DICT[CpuVendor.INTEL].get(result.group(1), default) + # Some Intel CPUs (e.g. Intel Sapphire Rapids) don't include the " @ ..." suffix. + return CPU_DICT[CpuVendor.INTEL].get(cpu_model, default) if vendor == CpuVendor.AMD: result = re.match(r"^(.*) [0-9]*-Core Processor$", cpu_model) if result: diff --git a/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py b/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py index c8075faa505..3388b1935a9 100644 --- a/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py +++ b/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py @@ -157,10 +157,12 @@ } -def test_host_vs_guest_cpu_features(uvm_nano): +def test_host_vs_guest_cpu_features(uvm_plain_any): """Check CPU features host vs guest""" - vm = uvm_nano + vm = uvm_plain_any + vm.spawn() + vm.basic_config() vm.add_net_iface() vm.start() host_feats = set(utils.check_output(CPU_FEATURES_CMD).stdout.split()) @@ -231,6 +233,10 @@ def test_host_vs_guest_cpu_features(uvm_nano): assert host_feats - guest_feats == host_guest_diff_6_1 assert guest_feats - host_feats == INTEL_GUEST_ONLY_FEATS - {"umip"} + case CpuModel.INTEL_SAPPHIRE_RAPIDS: + assert host_feats - guest_feats == INTEL_HOST_ONLY_FEATS + assert guest_feats - host_feats == INTEL_GUEST_ONLY_FEATS + case CpuModel.ARM_NEOVERSE_N1: expected_guest_minus_host = set() expected_host_minus_guest = set() diff --git a/tests/integration_tests/performance/test_snapshot_ab.py b/tests/integration_tests/performance/test_snapshot_ab.py index 23224de6b31..e28d3166617 100644 --- a/tests/integration_tests/performance/test_snapshot_ab.py +++ b/tests/integration_tests/performance/test_snapshot_ab.py @@ -10,6 +10,7 @@ import host_tools.drive as drive_tools from framework.microvm import Microvm +from framework.properties import global_props USEC_IN_MSEC = 1000 ITERATIONS = 30 @@ -124,6 
+125,10 
@@ def sample_latency( ], ids=lambda x: x.id, ) +@pytest.mark.skipif( + global_props.cpu_codename == "INTEL_SAPPHIRE_RAPIDS", + reason="Intel Sapphire Rapids to be supported soon in upcoming change", +) def test_restore_latency( microvm_factory, rootfs, guest_kernel_linux_5_10, test_setup, metrics ): diff --git a/tools/bindgen.sh b/tools/bindgen.sh index e2698b81daf..6c64bc5be93 100755 --- a/tools/bindgen.sh +++ b/tools/bindgen.sh @@ -154,6 +154,15 @@ fc-bindgen \ "amazonlinux-v5.10.y/include/uapi/linux/io_uring.h" \ >src/vmm/src/io_uring/gen.rs +# Latest upstream kernel +KERNEL_SRC_DIR="linux" +[ -d ${KERNEL_SRC_DIR} ] || git clone --depth 1 https://github.com/amazonlinux/linux ${KERNEL_SRC_DIR} + +info "BINDGEN asm/prctl.h" +fc-bindgen \ + --allowlist-var "ARCH_.*" \ + "${KERNEL_SRC_DIR}/arch/x86/include/uapi/asm/prctl.h" >src/vmm/src/arch/x86_64/gen/arch_prctl.rs + # Apply any patches info "Apply patches" for PATCH in $(dirname $0)/bindgen-patches/*.patch; do