@@ -6,7 +6,11 @@ use kvm_bindings::KVM_API_VERSION;
66use kvm_bindings:: { CpuId , MsrList , KVM_MAX_CPUID_ENTRIES } ;
77use kvm_ioctls:: Kvm as KvmFd ;
88use serde:: { Deserialize , Serialize } ;
9+ #[ cfg( target_arch = "x86_64" ) ]
10+ use vmm_sys_util:: syscall:: SyscallReturnCode ;
911
12+ #[ cfg( target_arch = "x86_64" ) ]
13+ use crate :: arch:: x86_64:: gen:: arch_prctl;
1014use crate :: cpu_config:: templates:: KvmCapability ;
1115use crate :: vstate:: memory:: { GuestMemory , GuestMemoryMmap } ;
1216
@@ -25,8 +29,14 @@ pub enum KvmError {
2529 #[ cfg( target_arch = "x86_64" ) ]
2630 /// Failed to get supported cpuid: {0}
2731 GetSupportedCpuId ( kvm_ioctls:: Error ) ,
32+ #[ cfg( target_arch = "x86_64" ) ]
33+ /// Failed to get supported XSTATE features: {0}
34+ GetSupportedXstateFeatures ( std:: io:: Error ) ,
2835 /// The number of configured slots is bigger than the maximum reported by KVM
2936 NotEnoughMemorySlots ,
37+ #[ cfg( target_arch = "x86_64" ) ]
38+ /// Failed to request permission for a XSTATE feature ({0}): {1}
39+ RequestXstateFeatures ( u32 , std:: io:: Error ) ,
3040}
3141
3242/// Struct with kvm fd and kvm associated parameters.
@@ -73,6 +83,15 @@ impl Kvm {
7383
7484 #[ cfg( target_arch = "x86_64" ) ]
7585 {
86+ // Request permission for Intel AMX (Advanced Matrix Extensions) TILEDATA.
87+ //
88+ // Unless requested, on kernels prior to v6.4, KVM_GET_SUPPORTED_CPUID returns an
89+ // inconsistent state where TILECFG is set but TILEDATA isn't. Such a half-enabled state
90+ // causes guest crash during boot because a guest calls XSETBV instruction with all
91+ // XSAVE feature bits enumerated on CPUID and XSETBV only accepts either of both Intel
92+ // AMX bits enabled or disabled; otherwise resulting in general protection fault.
93+ Self :: request_xstate_feature_permission ( arch_prctl:: ARCH_XCOMP_TILEDATA ) ?;
94+
7695 let supported_cpuid = kvm_fd
7796 . get_supported_cpuid ( KVM_MAX_CPUID_ENTRIES )
7897 . map_err ( KvmError :: GetSupportedCpuId ) ?;
@@ -86,6 +105,65 @@ impl Kvm {
86105 }
87106 }
88107
108+ /// Request permission for a dynamic XSTATE features.
109+ ///
110+ /// Some XSTATE features are not permitted by default, because they require a larger area
111+ /// to save their states than the traditional 4096-byte area. Instead, the permission for them
112+ /// can be requested via arch_prctl().
113+ /// https://github.com/torvalds/linux/blob/master/Documentation/arch/x86/xstate.rst
114+ ///
115+ /// We request permission for them by default if available in order to retrieve the correct
116+ /// supported feature set via KVM_GET_SUPPORTED_CPUID.
117+ /// https://docs.kernel.org/virt/kvm/api.html#kvm-get-supported-cpuid
118+ ///
119+ /// Note that such requested features can be disabled by a CPU template and no memory allocation
120+ /// to save their states happens here immediately.
121+ #[ cfg( target_arch = "x86_64" ) ]
122+ fn request_xstate_feature_permission ( xfeature : u32 ) -> Result < ( ) , KvmError > {
123+ // Get the supported dynamic xstate features.
124+ let mut supported_xfeatures: libc:: c_ulong = 0 ;
125+ // SAFETY: Safe because the third input (`addr`) is a valid `c_ulong`` pointer.
126+ // https://man7.org/linux/man-pages/man2/arch_prctl.2.html
127+ SyscallReturnCode ( unsafe {
128+ libc:: syscall (
129+ libc:: SYS_arch_prctl ,
130+ arch_prctl:: ARCH_GET_XCOMP_SUPP ,
131+ & mut supported_xfeatures as * mut libc:: c_ulong ,
132+ )
133+ } )
134+ . into_empty_result ( )
135+ . or_else ( |err| {
136+ if err. raw_os_error ( ) == Some ( libc:: EINVAL ) {
137+ // EINVAL is returned if the dynamic XSTATE feature enabling is not supported (e.g.
138+ // kernel version prior to v5.17).
139+ // https://github.com/torvalds/linux/commit/980fe2fddcff21937c93532b4597c8ea450346c1
140+ //
141+ // `supported_xfeatures` remains 0 here, so will skip permission request.
142+ Ok ( ( ) )
143+ } else {
144+ Err ( err)
145+ }
146+ } )
147+ . map_err ( KvmError :: GetSupportedXstateFeatures ) ?;
148+
149+ // Request permission for the given XSTATE feature only if available
150+ let xfeature_mask: libc:: c_ulong = 1u64 << xfeature;
151+ if ( supported_xfeatures & xfeature_mask) == xfeature_mask {
152+ // SAFETY: Safe because all inputs are valid as `c_ulong`` values.
153+ SyscallReturnCode ( unsafe {
154+ libc:: syscall (
155+ libc:: SYS_arch_prctl ,
156+ arch_prctl:: ARCH_REQ_XCOMP_GUEST_PERM ,
157+ xfeature,
158+ )
159+ } )
160+ . into_empty_result ( )
161+ . map_err ( |err| KvmError :: RequestXstateFeatures ( xfeature, err) ) ?;
162+ }
163+
164+ Ok ( ( ) )
165+ }
166+
89167 /// Msrs needed to be saved on snapshot creation.
90168 #[ cfg( target_arch = "x86_64" ) ]
91169 pub fn msrs_to_save ( & self ) -> Result < MsrList , crate :: arch:: x86_64:: msr:: MsrError > {
@@ -215,4 +293,54 @@ pub(crate) mod tests {
215293 . iter( )
216294 . any( |c| * c == kvm_bindings:: KVM_CAP_IOEVENTFD ) ) ;
217295 }
296+
#[cfg(target_arch = "x86_64")]
#[test]
fn test_request_xstate_feature_permission() {
    // The request for Intel AMX TILEDATA is issued before anything is known about the host,
    // so it is expected to succeed regardless of kernel or CPU support.
    Kvm::request_xstate_feature_permission(arch_prctl::ARCH_XCOMP_TILEDATA).unwrap();

    // Ask the kernel which dynamic XSTATE features this host supports.
    let mut host_supported: libc::c_ulong = 0;
    // SAFETY: Safe because the third input (`addr`) is a valid `c_ulong` pointer.
    let support_query = SyscallReturnCode(unsafe {
        libc::syscall(
            libc::SYS_arch_prctl,
            arch_prctl::ARCH_GET_XCOMP_SUPP,
            &mut host_supported as *mut libc::c_ulong,
        )
    })
    .into_empty_result();

    if let Err(err) = support_query {
        if err.raw_os_error() == Some(libc::EINVAL) {
            // The kernel does not support dynamic XSTATE feature enabling at all, so there is
            // nothing further to verify on this kernel version.
            return;
        }
        panic!("Unexpected error: {}", err);
    }

    // Without Intel AMX support there is nothing further to verify on this CPU.
    let amx_mask: libc::c_ulong = 1u64 << arch_prctl::ARCH_XCOMP_TILEDATA;
    let amx_supported = host_supported & amx_mask == amx_mask;
    if !amx_supported {
        return;
    }

    // Read back the guest permissions and check that the AMX TILEDATA bit is now granted.
    let mut guest_permitted: libc::c_ulong = 0;
    // SAFETY: Safe because the third input (`addr`) is a valid `c_ulong` pointer.
    let permission_query = SyscallReturnCode(unsafe {
        libc::syscall(
            libc::SYS_arch_prctl,
            arch_prctl::ARCH_GET_XCOMP_GUEST_PERM,
            &mut guest_permitted as *mut libc::c_ulong,
        )
    })
    .into_empty_result();
    permission_query.unwrap();

    assert_eq!(guest_permitted & amx_mask, amx_mask);
}
218346}
0 commit comments