Skip to content

Commit 87e0cdd

Browse files
cperciva and aljimenezb
authored and committed
pvh/arch-x86_64: Initialize vCPU regs for PVH
Set the initial values of the KVM vCPU registers as specified in the PVH boot ABI:
https://xenbits.xen.org/docs/unstable/misc/pvh.html

Add stub bits for aarch64; PVH mode does not exist there.

Signed-off-by: Colin Percival <[email protected]>
Co-authored-by: Alejandro Jimenez <[email protected]>
1 parent b4d1e3b commit 87e0cdd

File tree

7 files changed

+232
-74
lines changed

7 files changed

+232
-74
lines changed

src/vmm/src/arch/x86_64/gdt.rs

Lines changed: 34 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,5 @@
1+
// Copyright © 2020, Oracle and/or its affiliates.
2+
//
13
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
24
// SPDX-License-Identifier: Apache-2.0
35
//
@@ -24,8 +26,38 @@ fn get_base(entry: u64) -> u64 {
2426
| (((entry) & 0x0000_0000_FFFF_0000) >> 16)
2527
}
2628

29+
// Extract the segment limit from the GDT segment descriptor.
30+
//
31+
// In a segment descriptor, the limit field is 20 bits, so it can directly describe
32+
// a range from 0 to 0xFFFFF (1 MB). When G flag is set (4-KByte page granularity) it
33+
// scales the value in the limit field by a factor of 2^12 (4 Kbytes), making the effective
34+
// limit range from 0xFFF (4 KBytes) to 0xFFFF_FFFF (4 GBytes).
35+
//
36+
// However, the limit field in the VMCS definition is a 32 bit field, and the limit value is not
37+
// automatically scaled using the G flag. This means that for a desired range of 4GB for a
38+
// given segment, its limit must be specified as 0xFFFF_FFFF. Therefore the method of obtaining
39+
// the limit from the GDT entry is not sufficient, since it only provides 20 bits when 32 bits
40+
// are necessary. Fortunately, we can check if the G flag is set when extracting the limit since
41+
// the full GDT entry is passed as an argument, and perform the scaling of the limit value to
42+
// return the full 32 bit value.
43+
//
44+
// The scaling mentioned above is required when using PVH boot, since the guest boots in protected
45+
// (32-bit) mode and must be able to access the entire 32-bit address space. It does not cause
46+
// issues for the case of direct boot to 64-bit (long) mode, since in 64-bit mode the processor does
47+
// not perform runtime limit checking on code or data segments.
48+
//
49+
// (For more information concerning the formats of segment descriptors, VMCS fields, et cetera,
50+
// please consult the Intel Software Developer Manual.)
2751
fn get_limit(entry: u64) -> u32 {
28-
((((entry) & 0x000F_0000_0000_0000) >> 32) as u32) | (((entry) & 0x0000_0000_0000_FFFF) as u32)
52+
#[allow(clippy::cast_possible_truncation)] // the two masks keep only the 20 limit bits, so the value always fits in a u32
53+
let limit: u32 =
54+
((((entry) & 0x000F_0000_0000_0000) >> 32) | ((entry) & 0x0000_0000_0000_FFFF)) as u32;
55+
56+
// Perform manual limit scaling if G flag is set
57+
match get_g(entry) {
58+
0 => limit,
59+
_ => (limit << 12) | 0xFFF, // G flag is either 0 or 1
60+
}
2961
}
3062

3163
fn get_g(entry: u64) -> u8 {
@@ -109,7 +141,7 @@ mod tests {
109141
assert_eq!(0xB, seg.type_);
110142
// base and limit
111143
assert_eq!(0x10_0000, seg.base);
112-
assert_eq!(0xfffff, seg.limit);
144+
assert_eq!(0xffff_ffff, seg.limit);
113145
assert_eq!(0x0, seg.unusable);
114146
}
115147
}

src/vmm/src/arch/x86_64/layout.rs

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,9 @@ pub const IRQ_MAX: u32 = 23;
2727
/// Address for the TSS setup.
2828
pub const KVM_TSS_ADDRESS: u64 = 0xfffb_d000;
2929

30+
/// Address of the hvm_start_info struct used in PVH boot
31+
pub const PVH_INFO_START: u64 = 0x6000;
32+
3033
/// The 'zero page', a.k.a linux kernel bootparams.
3134
pub const ZERO_PAGE_START: u64 = 0x7000;
3235

src/vmm/src/arch/x86_64/regs.rs

Lines changed: 127 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
// Copyright © 2020, Oracle and/or its affiliates.
12
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
23
// SPDX-License-Identifier: Apache-2.0
34
//
@@ -10,6 +11,7 @@ use std::mem;
1011
use kvm_bindings::{kvm_fpu, kvm_regs, kvm_sregs};
1112
use kvm_ioctls::VcpuFd;
1213

14+
use super::super::{BootProtocol, EntryPoint};
1315
use super::gdt::{gdt_entry, kvm_segment_from_gdt};
1416
use crate::vstate::memory::{Address, Bytes, GuestAddress, GuestMemory, GuestMemoryMmap};
1517

@@ -80,20 +82,30 @@ pub struct SetupRegistersError(vmm_sys_util::errno::Error);
8082
/// # Errors
8183
///
8284
/// When [`kvm_ioctls::ioctls::vcpu::VcpuFd::set_regs`] errors.
83-
pub fn setup_regs(vcpu: &VcpuFd, boot_ip: u64) -> Result<(), SetupRegistersError> {
84-
let regs: kvm_regs = kvm_regs {
85-
rflags: 0x0000_0000_0000_0002u64,
86-
rip: boot_ip,
87-
// Frame pointer. It gets a snapshot of the stack pointer (rsp) so that when adjustments are
88-
// made to rsp (i.e. reserving space for local variables or pushing values on to the stack),
89-
// local variables and function parameters are still accessible from a constant offset from
90-
// rbp.
91-
rsp: super::layout::BOOT_STACK_POINTER,
92-
// Starting stack pointer.
93-
rbp: super::layout::BOOT_STACK_POINTER,
94-
// Must point to zero page address per Linux ABI. This is x86_64 specific.
95-
rsi: super::layout::ZERO_PAGE_START,
96-
..Default::default()
85+
pub fn setup_regs(vcpu: &VcpuFd, entry_point: EntryPoint) -> Result<(), SetupRegistersError> {
86+
let regs: kvm_regs = match entry_point.protocol {
87+
BootProtocol::PvhBoot => kvm_regs {
88+
// Configure regs as required by PVH boot protocol.
89+
rflags: 0x0000_0000_0000_0002u64,
90+
rbx: super::layout::PVH_INFO_START,
91+
rip: entry_point.entry_addr.raw_value(),
92+
..Default::default()
93+
},
94+
BootProtocol::LinuxBoot => kvm_regs {
95+
// Configure regs as required by Linux 64-bit boot protocol.
96+
rflags: 0x0000_0000_0000_0002u64,
97+
rip: entry_point.entry_addr.raw_value(),
98+
// Starting stack pointer. The frame pointer (rbp) set below starts at the same address:
99+
// rbp holds a snapshot of rsp so that when adjustments are made to rsp (e.g. reserving
100+
// space for local variables or pushing values on to the stack), local variables and
101+
// function parameters are still accessible from a constant offset from rbp.
102+
rsp: super::layout::BOOT_STACK_POINTER,
103+
// Frame pointer, initialized to the same value as the stack pointer.
104+
rbp: super::layout::BOOT_STACK_POINTER,
105+
// Must point to zero page address per Linux ABI. This is x86_64 specific.
106+
rsi: super::layout::ZERO_PAGE_START,
107+
..Default::default()
108+
},
97109
};
98110

99111
vcpu.set_regs(&regs).map_err(SetupRegistersError)
@@ -118,6 +130,7 @@ pub enum SetupSpecialRegistersError {
118130
///
119131
/// * `mem` - The memory that will be passed to the guest.
120132
/// * `vcpu` - Structure for the VCPU that holds the VCPU's fd.
133+
/// * `boot_prot` - The boot protocol being used.
121134
///
122135
/// # Errors
123136
///
@@ -126,14 +139,21 @@ pub enum SetupSpecialRegistersError {
126139
/// - [`configure_segments_and_sregs`] errors.
127140
/// - [`setup_page_tables`] errors
128141
/// - [`kvm_ioctls::ioctls::vcpu::VcpuFd::set_sregs`] errors.
129-
pub fn setup_sregs(mem: &GuestMemoryMmap, vcpu: &VcpuFd) -> Result<(), SetupSpecialRegistersError> {
142+
pub fn setup_sregs(
143+
mem: &GuestMemoryMmap,
144+
vcpu: &VcpuFd,
145+
boot_prot: BootProtocol,
146+
) -> Result<(), SetupSpecialRegistersError> {
130147
let mut sregs: kvm_sregs = vcpu
131148
.get_sregs()
132149
.map_err(SetupSpecialRegistersError::GetSpecialRegisters)?;
133150

134-
configure_segments_and_sregs(mem, &mut sregs)
151+
configure_segments_and_sregs(mem, &mut sregs, boot_prot)
135152
.map_err(SetupSpecialRegistersError::ConfigureSegmentsAndSpecialRegisters)?;
136-
setup_page_tables(mem, &mut sregs).map_err(SetupSpecialRegistersError::SetupPageTables)?; // TODO(dgreid) - Can this be done once per system instead?
153+
if let BootProtocol::LinuxBoot = boot_prot {
154+
setup_page_tables(mem, &mut sregs).map_err(SetupSpecialRegistersError::SetupPageTables)?;
155+
// TODO(dgreid) - Can this be done once per system instead?
156+
}
137157

138158
vcpu.set_sregs(&sregs)
139159
.map_err(SetupSpecialRegistersError::SetSpecialRegisters)
@@ -148,6 +168,7 @@ const EFER_LMA: u64 = 0x400;
148168
const EFER_LME: u64 = 0x100;
149169

150170
const X86_CR0_PE: u64 = 0x1;
171+
const X86_CR0_ET: u64 = 0x10;
151172
const X86_CR0_PG: u64 = 0x8000_0000;
152173
const X86_CR4_PAE: u64 = 0x20;
153174

@@ -174,13 +195,28 @@ fn write_idt_value(val: u64, guest_mem: &GuestMemoryMmap) -> Result<(), RegsErro
174195
fn configure_segments_and_sregs(
175196
mem: &GuestMemoryMmap,
176197
sregs: &mut kvm_sregs,
198+
boot_prot: BootProtocol,
177199
) -> Result<(), RegsError> {
178-
let gdt_table: [u64; BOOT_GDT_MAX] = [
179-
gdt_entry(0, 0, 0), // NULL
180-
gdt_entry(0xa09b, 0, 0xfffff), // CODE
181-
gdt_entry(0xc093, 0, 0xfffff), // DATA
182-
gdt_entry(0x808b, 0, 0xfffff), // TSS
183-
];
200+
let gdt_table: [u64; BOOT_GDT_MAX] = match boot_prot {
201+
BootProtocol::PvhBoot => {
202+
// Configure GDT entries as specified by PVH boot protocol
203+
[
204+
gdt_entry(0, 0, 0), // NULL
205+
gdt_entry(0xc09b, 0, 0xffff_ffff), // CODE
206+
gdt_entry(0xc093, 0, 0xffff_ffff), // DATA
207+
gdt_entry(0x008b, 0, 0x67), // TSS
208+
]
209+
}
210+
BootProtocol::LinuxBoot => {
211+
// Configure GDT entries as specified by Linux 64bit boot protocol
212+
[
213+
gdt_entry(0, 0, 0), // NULL
214+
gdt_entry(0xa09b, 0, 0xfffff), // CODE
215+
gdt_entry(0xc093, 0, 0xfffff), // DATA
216+
gdt_entry(0x808b, 0, 0xfffff), // TSS
217+
]
218+
}
219+
};
184220

185221
let code_seg = kvm_segment_from_gdt(gdt_table[1], 1);
186222
let data_seg = kvm_segment_from_gdt(gdt_table[2], 2);
@@ -203,9 +239,17 @@ fn configure_segments_and_sregs(
203239
sregs.ss = data_seg;
204240
sregs.tr = tss_seg;
205241

206-
// 64-bit protected mode
207-
sregs.cr0 |= X86_CR0_PE;
208-
sregs.efer |= EFER_LME | EFER_LMA;
242+
match boot_prot {
243+
BootProtocol::PvhBoot => {
244+
sregs.cr0 = X86_CR0_PE | X86_CR0_ET;
245+
sregs.cr4 = 0;
246+
}
247+
BootProtocol::LinuxBoot => {
248+
// 64-bit protected mode
249+
sregs.cr0 |= X86_CR0_PE;
250+
sregs.efer |= EFER_LME | EFER_LMA;
251+
}
252+
}
209253

210254
Ok(())
211255
}
@@ -251,24 +295,45 @@ mod tests {
251295
gm.read_obj(read_addr).unwrap()
252296
}
253297

254-
fn validate_segments_and_sregs(gm: &GuestMemoryMmap, sregs: &kvm_sregs) {
298+
fn validate_segments_and_sregs(
299+
gm: &GuestMemoryMmap,
300+
sregs: &kvm_sregs,
301+
boot_prot: BootProtocol,
302+
) {
303+
if let BootProtocol::LinuxBoot = boot_prot {
304+
assert_eq!(0xaf_9b00_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 8));
305+
assert_eq!(0xcf_9300_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 16));
306+
assert_eq!(0x8f_8b00_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 24));
307+
308+
assert_eq!(0xffff_ffff, sregs.tr.limit);
309+
310+
assert!(sregs.cr0 & X86_CR0_PE != 0);
311+
assert!(sregs.efer & EFER_LME != 0 && sregs.efer & EFER_LMA != 0);
312+
} else {
313+
// Validate values that are specific to PVH boot protocol
314+
assert_eq!(0xcf_9b00_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 8));
315+
assert_eq!(0xcf_9300_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 16));
316+
assert_eq!(0x00_8b00_0000_0067, read_u64(gm, BOOT_GDT_OFFSET + 24));
317+
318+
assert_eq!(0x67, sregs.tr.limit);
319+
assert_eq!(0, sregs.tr.g);
320+
321+
assert!(sregs.cr0 & X86_CR0_PE != 0 && sregs.cr0 & X86_CR0_ET != 0);
322+
assert_eq!(0, sregs.cr4);
323+
}
324+
325+
// Common settings for both PVH and Linux boot protocol
255326
assert_eq!(0x0, read_u64(gm, BOOT_GDT_OFFSET));
256-
assert_eq!(0xaf_9b00_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 8));
257-
assert_eq!(0xcf_9300_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 16));
258-
assert_eq!(0x8f_8b00_0000_ffff, read_u64(gm, BOOT_GDT_OFFSET + 24));
259327
assert_eq!(0x0, read_u64(gm, BOOT_IDT_OFFSET));
260328

261329
assert_eq!(0, sregs.cs.base);
262-
assert_eq!(0xfffff, sregs.ds.limit);
330+
assert_eq!(0xffff_ffff, sregs.ds.limit);
263331
assert_eq!(0x10, sregs.es.selector);
264332
assert_eq!(1, sregs.fs.present);
265333
assert_eq!(1, sregs.gs.g);
266334
assert_eq!(0, sregs.ss.avl);
267335
assert_eq!(0, sregs.tr.base);
268-
assert_eq!(0xfffff, sregs.tr.limit);
269336
assert_eq!(0, sregs.tr.avl);
270-
assert!(sregs.cr0 & X86_CR0_PE != 0);
271-
assert!(sregs.efer & EFER_LME != 0 && sregs.efer & EFER_LMA != 0);
272337
}
273338

274339
fn validate_page_tables(gm: &GuestMemoryMmap, sregs: &kvm_sregs) {
@@ -320,7 +385,12 @@ mod tests {
320385
..Default::default()
321386
};
322387

323-
setup_regs(&vcpu, expected_regs.rip).unwrap();
388+
let entry_point: EntryPoint = EntryPoint {
389+
entry_addr: GuestAddress(expected_regs.rip),
390+
protocol: BootProtocol::LinuxBoot,
391+
};
392+
393+
setup_regs(&vcpu, entry_point).unwrap();
324394

325395
let actual_regs: kvm_regs = vcpu.get_regs().unwrap();
326396
assert_eq!(actual_regs, expected_regs);
@@ -333,16 +403,22 @@ mod tests {
333403
let vcpu = vm.create_vcpu(0).unwrap();
334404
let gm = single_region_mem(0x10000);
335405

336-
vcpu.set_sregs(&Default::default()).unwrap();
337-
setup_sregs(&gm, &vcpu).unwrap();
338-
339-
let mut sregs: kvm_sregs = vcpu.get_sregs().unwrap();
340-
// for AMD KVM_GET_SREGS returns g = 0 for each kvm_segment.
341-
// We set it to 1, otherwise the test will fail.
342-
sregs.gs.g = 1;
343-
344-
validate_segments_and_sregs(&gm, &sregs);
345-
validate_page_tables(&gm, &sregs);
406+
[BootProtocol::LinuxBoot, BootProtocol::PvhBoot]
407+
.iter()
408+
.for_each(|boot_prot| {
409+
vcpu.set_sregs(&Default::default()).unwrap();
410+
setup_sregs(&gm, &vcpu, *boot_prot).unwrap();
411+
412+
let mut sregs: kvm_sregs = vcpu.get_sregs().unwrap();
413+
// for AMD KVM_GET_SREGS returns g = 0 for each kvm_segment.
414+
// We set it to 1, otherwise the test will fail.
415+
sregs.gs.g = 1;
416+
417+
validate_segments_and_sregs(&gm, &sregs, *boot_prot);
418+
if let BootProtocol::LinuxBoot = *boot_prot {
419+
validate_page_tables(&gm, &sregs);
420+
}
421+
});
346422
}
347423

348424
#[test]
@@ -386,9 +462,13 @@ mod tests {
386462
fn test_configure_segments_and_sregs() {
387463
let mut sregs: kvm_sregs = Default::default();
388464
let gm = single_region_mem(0x10000);
389-
configure_segments_and_sregs(&gm, &mut sregs).unwrap();
465+
configure_segments_and_sregs(&gm, &mut sregs, BootProtocol::LinuxBoot).unwrap();
466+
467+
validate_segments_and_sregs(&gm, &sregs, BootProtocol::LinuxBoot);
468+
469+
configure_segments_and_sregs(&gm, &mut sregs, BootProtocol::PvhBoot).unwrap();
390470

391-
validate_segments_and_sregs(&gm, &sregs);
471+
validate_segments_and_sregs(&gm, &sregs, BootProtocol::PvhBoot);
392472
}
393473

394474
#[test]

src/vmm/src/builder.rs

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -332,7 +332,7 @@ pub fn build_microvm_for_boot(
332332
vcpus.as_mut(),
333333
&vm_resources.machine_config,
334334
&cpu_template,
335-
entry_point.entry_addr,
335+
entry_point,
336336
&initrd,
337337
boot_cmdline,
338338
)?;
@@ -567,6 +567,7 @@ pub fn build_microvm_from_snapshot(
567567
Ok(vmm)
568568
}
569569

570+
#[cfg(target_arch = "x86_64")]
570571
fn load_kernel(
571572
boot_config: &BootConfig,
572573
guest_memory: &GuestMemoryMmap,
@@ -751,7 +752,7 @@ pub fn configure_system_for_boot(
751752
vcpus: &mut [Vcpu],
752753
machine_config: &MachineConfig,
753754
cpu_template: &CustomCpuTemplate,
754-
entry_addr: GuestAddress,
755+
entry_point: EntryPoint,
755756
initrd: &Option<InitrdConfig>,
756757
boot_cmdline: LoaderKernelCmdline,
757758
) -> Result<(), StartMicrovmError> {
@@ -802,7 +803,7 @@ pub fn configure_system_for_boot(
802803
// Configure vCPUs with normalizing and setting the generated CPU configuration.
803804
for vcpu in vcpus.iter_mut() {
804805
vcpu.kvm_vcpu
805-
.configure(vmm.guest_memory(), entry_addr, &vcpu_config)
806+
.configure(vmm.guest_memory(), entry_point, &vcpu_config)
806807
.map_err(VmmError::VcpuConfigure)
807808
.map_err(Internal)?;
808809
}
@@ -847,7 +848,7 @@ pub fn configure_system_for_boot(
847848
vcpu.kvm_vcpu
848849
.configure(
849850
vmm.guest_memory(),
850-
entry_addr,
851+
entry_point,
851852
&vcpu_config,
852853
&optional_capabilities,
853854
)

0 commit comments

Comments
 (0)