firecracker-microvm · zulinx86 · Feb 28, 2025 · Feb 28, 2025 · Mar 3, 2025 · Mar 5, 2025
diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml
@@ -9,7 +9,7 @@ license = "Apache-2.0"
 bench = false
 
 [dependencies]
-acpi_tables = { path = "../acpi-tables" } 
+acpi_tables = { path = "../acpi-tables" }
 aes-gcm =  { version = "0.10.1", default-features = false, features = ["aes"] }
 arrayvec = { version = "0.7.6", optional = true }
 aws-lc-rs = { version = "1.12.4", features = ["bindgen"] }

diff --git a/src/vmm/src/arch/x86_64/gen/arch_prctl.rs b/src/vmm/src/arch/x86_64/gen/arch_prctl.rs
@@ -0,0 +1,43 @@
+// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+// automatically generated by tools/bindgen.sh
+
+#![allow(
+    non_camel_case_types,
+    non_upper_case_globals,
+    dead_code,
+    non_snake_case,
+    clippy::ptr_as_ptr,
+    clippy::undocumented_unsafe_blocks,
+    missing_debug_implementations,
+    clippy::tests_outside_test_module
+)]
+
+pub const ARCH_SET_GS: u32 = 4097;
+pub const ARCH_SET_FS: u32 = 4098;
+pub const ARCH_GET_FS: u32 = 4099;
+pub const ARCH_GET_GS: u32 = 4100;
+pub const ARCH_GET_CPUID: u32 = 4113;
+pub const ARCH_SET_CPUID: u32 = 4114;
+pub const ARCH_GET_XCOMP_SUPP: u32 = 4129; // arch_prctl op: used by the VMM to read the supported XSTATE feature bitmask
+pub const ARCH_GET_XCOMP_PERM: u32 = 4130;
+pub const ARCH_REQ_XCOMP_PERM: u32 = 4131;
+pub const ARCH_GET_XCOMP_GUEST_PERM: u32 = 4132; // arch_prctl op: used by the VMM tests to read the guest-permitted XSTATE feature bitmask
+pub const ARCH_REQ_XCOMP_GUEST_PERM: u32 = 4133; // arch_prctl op: used by the VMM to request guest permission for an XSTATE feature
+pub const ARCH_XCOMP_TILECFG: u32 = 17; // XSTATE feature bit index (used as a shift amount for the Intel AMX mask)
+pub const ARCH_XCOMP_TILEDATA: u32 = 18; // XSTATE feature bit index (used as a shift amount for the Intel AMX mask)
+pub const ARCH_MAP_VDSO_X32: u32 = 8193;
+pub const ARCH_MAP_VDSO_32: u32 = 8194;
+pub const ARCH_MAP_VDSO_64: u32 = 8195;
+pub const ARCH_GET_UNTAG_MASK: u32 = 16385;
+pub const ARCH_ENABLE_TAGGED_ADDR: u32 = 16386;
+pub const ARCH_GET_MAX_TAG_BITS: u32 = 16387;
+pub const ARCH_FORCE_TAGGED_SVA: u32 = 16388;
+pub const ARCH_SHSTK_ENABLE: u32 = 20481;
+pub const ARCH_SHSTK_DISABLE: u32 = 20482;
+pub const ARCH_SHSTK_LOCK: u32 = 20483;
+pub const ARCH_SHSTK_UNLOCK: u32 = 20484;
+pub const ARCH_SHSTK_STATUS: u32 = 20485;
+pub const ARCH_SHSTK_SHSTK: u32 = 1;
+pub const ARCH_SHSTK_WRSS: u32 = 2;
diff --git a/src/vmm/src/arch/x86_64/gen/mod.rs b/src/vmm/src/arch/x86_64/gen/mod.rs
@@ -5,9 +5,9 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the THIRD-PARTY file.
 
+pub mod arch_prctl;
 pub mod hyperv;
 pub mod hyperv_tlfs;
 pub mod mpspec;
-
 pub mod msr_index;
 pub mod perf_event;
diff --git a/src/vmm/src/vstate/kvm.rs b/src/vmm/src/vstate/kvm.rs
@@ -6,14 +6,18 @@
 use kvm_bindings::{CpuId, MsrList, KVM_MAX_CPUID_ENTRIES};
 use kvm_ioctls::Kvm as KvmFd;
 use serde::{Deserialize, Serialize};
+#[cfg(target_arch = "x86_64")]
+use vmm_sys_util::syscall::SyscallReturnCode;
 
+#[cfg(target_arch = "x86_64")]
+use crate::arch::x86_64::gen::arch_prctl;
 use crate::cpu_config::templates::KvmCapability;
 use crate::vstate::memory::{GuestMemory, GuestMemoryMmap};
 
 /// Errors associated with the wrappers over KVM ioctls.
 /// Needs `rustfmt::skip` to make multiline comments work
 #[rustfmt::skip]
-#[derive(Debug, PartialEq, Eq, thiserror::Error, displaydoc::Display)]
+#[derive(Debug, thiserror::Error, displaydoc::Display)]
 pub enum KvmError {
     /// The host kernel reports an invalid KVM API version: {0}
     ApiVersion(i32),
@@ -25,8 +29,14 @@
     #[cfg(target_arch = "x86_64")]
     /// Failed to get supported cpuid: {0}
     GetSupportedCpuId(kvm_ioctls::Error),
+    #[cfg(target_arch = "x86_64")]
+    /// Failed to get supported XSTATE features: {0}
+    GetSupportedXstateFeatures(std::io::Error),
     /// The number of configured slots is bigger than the maximum reported by KVM
     NotEnoughMemorySlots,
+    #[cfg(target_arch = "x86_64")]
+    /// Failed to enable XSTATE features ({0:#b}): {1}
+    RequestXstateFeatures(u32, std::io::Error),
 }
 
 /// Struct with kvm fd and kvm associated parameters.
@@ -73,6 +83,8 @@
 
         #[cfg(target_arch = "x86_64")]
         {
+            Self::enable_intel_amx()?;
+
             let supported_cpuid = kvm_fd
                 .get_supported_cpuid(KVM_MAX_CPUID_ENTRIES)
                 .map_err(KvmError::GetSupportedCpuId)?;
@@ -86,6 +98,74 @@
         }
     }
 
+    /// XSTATE feature mask for Intel AMX: the bits at indices `ARCH_XCOMP_TILECFG` and
+    /// `ARCH_XCOMP_TILEDATA`, OR-ed together.
+    #[cfg(target_arch = "x86_64")]
+    const INTEL_AMX_XCOMP_MASK: libc::c_ulong =
+        (1u64 << arch_prctl::ARCH_XCOMP_TILECFG) | (1u64 << arch_prctl::ARCH_XCOMP_TILEDATA);
+
+    /// Enable Intel AMX if available.
+    ///
+    /// Intel AMX (Advanced Matrix Extensions) is an instruction set for AI workloads that was
+    /// introduced in Intel Sapphire Rapids (*7i.metal). Since it requires a larger area to save
+    /// its state, it is disabled by default.
+    /// <https://github.com/torvalds/linux/blob/master/Documentation/arch/x86/xstate.rst>
+    ///
+    /// We enable it by default (it can still be disabled by a CPU template); otherwise,
+    /// KVM_GET_SUPPORTED_CPUID returns an inconsistent state where TILECFG is enabled but
+    /// TILEDATA is disabled, causing a #GP fault in the guest on xsetbv due to the lack of a
+    /// sanity check.
+    /// https://lore.kernel.org/all/[email protected]/
+    /// (NOTE(review): the message-id in the link above is garbled; restore it from the upstream
+    /// thread.)
+    ///
+    /// Dynamically-enabled feature bits need to be requested with arch_prctl() before calling
+    /// KVM_GET_SUPPORTED_CPUID. Feature bits that have not been requested are excluded from the
+    /// result of KVM_GET_SUPPORTED_CPUID.
+    /// <https://docs.kernel.org/virt/kvm/api.html>
+    ///
+    /// Note that no memory allocation to save Intel AMX state happens here immediately.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`KvmError::GetSupportedXstateFeatures`] if `ARCH_GET_XCOMP_SUPP` fails with
+    /// anything other than `EINVAL`, and [`KvmError::RequestXstateFeatures`] if requesting the
+    /// guest permission for `ARCH_XCOMP_TILEDATA` fails.
+    #[cfg(target_arch = "x86_64")]
+    fn enable_intel_amx() -> Result<(), KvmError> {
+        // Get the supported xstate features. The mask stays 0 (i.e. "nothing supported") when
+        // the kernel does not know the operation.
+        let mut supported_xfeatures: libc::c_ulong = 0;
+        // SAFETY: Safe because the second input (`op`) might not be valid for unsupported kernels
+        // but EINVAL is handled later, and the third input (`addr`) is a valid c_ulong pointer.
+        // https://man7.org/linux/man-pages/man2/arch_prctl.2.html
+        SyscallReturnCode(unsafe {
+            libc::syscall(
+                libc::SYS_arch_prctl,
+                arch_prctl::ARCH_GET_XCOMP_SUPP,
+                &mut supported_xfeatures as *mut libc::c_ulong,
+            )
+        })
+        .into_empty_result()
+        .or_else(|err| match err.raw_os_error() {
+            // EINVAL is returned if ARCH_GET_XCOMP_SUPP is not supported (e.g. kernel versions
+            // prior to v5.17).
+            // https://github.com/torvalds/linux/commit/980fe2fddcff21937c93532b4597c8ea450346c1
+            Some(libc::EINVAL) => Ok(()),
+            _ => Err(err),
+        })
+        .map_err(KvmError::GetSupportedXstateFeatures)?;
+
+        // Enable Intel AMX only if both TILECFG and TILEDATA are supported.
+        if (supported_xfeatures & Self::INTEL_AMX_XCOMP_MASK) == Self::INTEL_AMX_XCOMP_MASK {
+            // SAFETY: Safe because ARCH_REQ_XCOMP_GUEST_PERM is supported if ARCH_GET_XCOMP_SUPP is
+            // supported and it has been confirmed that ARCH_XCOMP_TILEDATA is supported.
+            SyscallReturnCode(unsafe {
+                libc::syscall(
+                    libc::SYS_arch_prctl,
+                    arch_prctl::ARCH_REQ_XCOMP_GUEST_PERM,
+                    // arch_prctl(2) takes its second argument as `unsigned long`; widen the
+                    // `u32` explicitly so the variadic call passes a full-width value.
+                    libc::c_ulong::from(arch_prctl::ARCH_XCOMP_TILEDATA),
+                )
+            })
+            .into_empty_result()
+            .map_err(|err| KvmError::RequestXstateFeatures(arch_prctl::ARCH_XCOMP_TILEDATA, err))?;
+        }
+
+        Ok(())
+    }
+
     /// Msrs needed to be saved on snapshot creation.
     #[cfg(target_arch = "x86_64")]
     pub fn msrs_to_save(&self) -> Result<MsrList, crate::arch::x86_64::msr::MsrError> {
@@ -215,4 +295,99 @@
             .iter()
             .any(|c| *c == kvm_bindings::KVM_CAP_IOEVENTFD));
     }
+
+    #[cfg(target_arch = "x86_64")]
+    mod x86_64 {
+        use super::*;
+        use crate::arch::x86_64::cpu_model::CpuModel;
+
+        /// Host kernel version as `(major, minor)`; the derived ordering compares the major
+        /// number first, then the minor number.
+        #[derive(PartialEq, PartialOrd)]
+        struct KernelVersion(u32, u32);
+
+        impl KernelVersion {
+            /// Returns the version of the running kernel, parsed from the first two
+            /// dot-separated components of `/proc/sys/kernel/osrelease`.
+            fn current() -> Self {
+                let release = std::fs::read_to_string("/proc/sys/kernel/osrelease")
+                    .expect("failed to read /proc/sys/kernel/osrelease");
+                let mut components = release.trim().split('.').map(Self::parse_component);
+
+                let major = components.next().expect("kernel release has no major version");
+                let minor = components.next().expect("kernel release has no minor version");
+
+                KernelVersion(major, minor)
+            }
+
+            /// Parses the leading decimal digits of a release component, so that suffixed
+            /// components such as `17-rc1` still yield `17`.
+            fn parse_component(component: &str) -> u32 {
+                let digits_end = component
+                    .find(|c: char| !c.is_ascii_digit())
+                    .unwrap_or(component.len());
+                match component[..digits_end].parse() {
+                    Ok(number) => number,
+                    Err(err) => panic!("invalid kernel version part `{}`: {}", component, err),
+                }
+            }
+        }
+
+        /// CPU vendor of the host, derived from the `vendor_id` field of `/proc/cpuinfo`.
+        #[derive(PartialEq)]
+        enum Vendor {
+            Intel,
+            Amd,
+            /// Any other vendor string; the test only distinguishes Intel from non-Intel.
+            Other,
+        }
+
+        impl Vendor {
+            /// Detects the vendor of the CPU the test is running on.
+            fn new() -> Self {
+                match Self::get_vendor_id_str().as_str() {
+                    "GenuineIntel" => Vendor::Intel,
+                    "AuthenticAMD" => Vendor::Amd,
+                    // An unrecognised vendor is not a reason to fail an Intel AMX test.
+                    _ => Vendor::Other,
+                }
+            }
+
+            /// Returns the value of the first `vendor_id` line in `/proc/cpuinfo`.
+            fn get_vendor_id_str() -> String {
+                let cpuinfo = std::fs::read_to_string("/proc/cpuinfo")
+                    .expect("failed to read /proc/cpuinfo");
+
+                cpuinfo
+                    .lines()
+                    .find(|line| line.starts_with("vendor_id"))
+                    .and_then(|line| line.split(':').nth(1))
+                    .map(|value| value.trim().to_string())
+                    .expect("`vendor_id` not found in /proc/cpuinfo")
+            }
+        }
+
+        #[test]
+        fn test_enable_intel_amx() {
+            Kvm::enable_intel_amx().unwrap();
+
+            // ARCH_{REQ,GET}_XCOMP_GUEST_PERM were added in kernel v5.17, so there is nothing
+            // to verify on older kernels.
+            // https://github.com/torvalds/linux/commit/980fe2fddcff21937c93532b4597c8ea450346c1
+            if KernelVersion::current() < KernelVersion(5, 17) {
+                return;
+            }
+
+            let mut permitted_xfeatures: libc::c_ulong = 0;
+            // SAFETY: Safe because the second input (`op`) should be valid on kernel v5.17+,
+            // and the third input (`addr`) is a valid `c_ulong` pointer.
+            SyscallReturnCode(unsafe {
+                libc::syscall(
+                    libc::SYS_arch_prctl,
+                    arch_prctl::ARCH_GET_XCOMP_GUEST_PERM,
+                    &mut permitted_xfeatures as *mut libc::c_ulong,
+                )
+            })
+            .into_empty_result()
+            .unwrap();
+
+            // Intel AMX is available only on Intel processors now, and was introduced in Intel
+            // Sapphire Rapids (CPUID.01H:EAX = 0x000806f8).
+            let supported_cpu = CpuModel::from(&0x000806f8);
+            let current_cpu = CpuModel::get_cpu_model();
+            let amx_expected = current_cpu >= supported_cpu && Vendor::new() == Vendor::Intel;
+
+            // Either both AMX bits are permitted for guests, or neither is.
+            let expected_bits = if amx_expected {
+                Kvm::INTEL_AMX_XCOMP_MASK
+            } else {
+                0
+            };
+            assert_eq!(
+                permitted_xfeatures & Kvm::INTEL_AMX_XCOMP_MASK,
+                expected_bits
+            );
+        }
+    }
 }
diff --git a/tests/framework/utils_cpuid.py b/tests/framework/utils_cpuid.py
@@ -32,6 +32,7 @@ class CpuModel(str, Enum):
     INTEL_SKYLAKE = "INTEL_SKYLAKE"
     INTEL_CASCADELAKE = "INTEL_CASCADELAKE"
     INTEL_ICELAKE = "INTEL_ICELAKE"
+    INTEL_SAPPHIRE_RAPIDS = "INTEL_SAPPHIRE_RAPIDS"
 
 
 CPU_DICT = {
@@ -40,6 +41,7 @@ class CpuModel(str, Enum):
         "Intel(R) Xeon(R) Platinum 8124M CPU": "INTEL_SKYLAKE",
         "Intel(R) Xeon(R) Platinum 8259CL CPU": "INTEL_CASCADELAKE",
         "Intel(R) Xeon(R) Platinum 8375C CPU": "INTEL_ICELAKE",
+        "Intel(R) Xeon(R) Platinum 8488C": "INTEL_SAPPHIRE_RAPIDS",
     },
     CpuVendor.AMD: {"AMD EPYC 7R13": "AMD_MILAN", "AMD EPYC 9R14": "AMD_GENOA"},
     CpuVendor.ARM: {
@@ -83,6 +85,8 @@ def get_cpu_codename(default="Unknown"):
         result = re.match(r"^(.*) @.*$", cpu_model)
         if result:
             return CPU_DICT[CpuVendor.INTEL].get(result.group(1), default)
+        # Some Intel CPUs (e.g. Intel Sapphire Rapids) don't include "@ <frequency>".
+        return CPU_DICT[CpuVendor.INTEL].get(cpu_model, default)
     if vendor == CpuVendor.AMD:
         result = re.match(r"^(.*) [0-9]*-Core Processor$", cpu_model)
         if result:

diff --git a/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py b/tests/integration_tests/functional/test_cpu_features_host_vs_guest.py
@@ -157,10 +157,12 @@
 }
 
 
-def test_host_vs_guest_cpu_features(uvm_nano):
+def test_host_vs_guest_cpu_features(uvm_plain_any):
     """Check CPU features host vs guest"""
 
-    vm = uvm_nano
+    vm = uvm_plain_any
+    vm.spawn()
+    vm.basic_config()
     vm.add_net_iface()
     vm.start()
     host_feats = set(utils.check_output(CPU_FEATURES_CMD).stdout.split())
@@ -231,6 +233,10 @@ def test_host_vs_guest_cpu_features(uvm_nano):
                 assert host_feats - guest_feats == host_guest_diff_6_1
             assert guest_feats - host_feats == INTEL_GUEST_ONLY_FEATS - {"umip"}
 
+        case CpuModel.INTEL_SAPPHIRE_RAPIDS:
+            assert host_feats - guest_feats == INTEL_HOST_ONLY_FEATS
+            assert guest_feats - host_feats == INTEL_GUEST_ONLY_FEATS
+
         case CpuModel.ARM_NEOVERSE_N1:
             expected_guest_minus_host = set()
             expected_host_minus_guest = set()

diff --git a/tests/integration_tests/performance/test_snapshot_ab.py b/tests/integration_tests/performance/test_snapshot_ab.py
@@ -10,6 +10,7 @@
 
 import host_tools.drive as drive_tools
 from framework.microvm import Microvm
+from framework.properties import global_props
 
 USEC_IN_MSEC = 1000
 ITERATIONS = 30
@@ -124,6 +125,10 @@ def sample_latency(
     ],
     ids=lambda x: x.id,
 )
+@pytest.mark.skipif(
+    global_props.cpu_codename == "INTEL_SAPPHIRE_RAPIDS",
+    reason="Intel Sapphire Rapids to be supported soon in upcoming change",
+)
 def test_restore_latency(
     microvm_factory, rootfs, guest_kernel_linux_5_10, test_setup, metrics
 ):

diff --git a/tools/bindgen.sh b/tools/bindgen.sh
@@ -154,6 +154,15 @@ fc-bindgen \
     "amazonlinux-v5.10.y/include/uapi/linux/io_uring.h" \
     >src/vmm/src/io_uring/gen.rs
 
+# Latest upstream kernel
+KERNEL_SRC_DIR="linux"
+[ -d ${KERNEL_SRC_DIR} ] || git clone --depth 1 https://github.com/amazonlinux/linux ${KERNEL_SRC_DIR}
+
+info "BINDGEN asm/prctl.h"
+fc-bindgen \
+    --allowlist-var "ARCH_.*" \
+    "${KERNEL_SRC_DIR}/arch/x86/include/uapi/asm/prctl.h" >src/vmm/src/arch/x86_64/gen/arch_prctl.rs
+
 # Apply any patches
 info "Apply patches"
 for PATCH in $(dirname $0)/bindgen-patches/*.patch; do