Skip to content

Commit 55dd66d

Browse files
author
Vineeth Pillai (Google)
committed
REORDER: pKVM: VMX: CPU reprivilege functionality
The reprivilege feature allows the host to boot as bare metal if pKVM initialization fails. All deprivileged CPUs (running in VMX non-root mode) are transitioned back to VMX root mode and then to non-VMX mode, so that the host can boot without pKVM. Signed-off-by: Vineeth Pillai (Google) <vineeth@bitbyteword.org>
1 parent 29f3759 commit 55dd66d

File tree

5 files changed

+213
-2
lines changed

5 files changed

+213
-2
lines changed

arch/x86/kvm/pkvm/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ pkvm-hyp-y += $(kernel-lib)/sort.o $(kernel-lib)/bsearch.o \
3939
kvm := ..
4040
pkvm-hyp-y += $(kvm)/x86.o $(kvm)/cpuid.o
4141

42-
pkvm-hyp-$(CONFIG_PKVM_INTEL) += vmx/host_vmentry.o vmx/host_vmx.o \
42+
pkvm-hyp-$(CONFIG_PKVM_INTEL) += vmx/host_vmentry.o vmx/host_vmx.o vmx/host_repriv.o \
4343
$(kvm)/vmx/vmx.o vmx/idt.o vmx/ept.o \
4444
$(kvm)/vmx/main.o
4545

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
// SPDX-License-Identifier: GPL-2.0
2+
/*
3+
* Copyright (C) 2025 Google
4+
*/
5+
6+
#include <vmx/vmx.h>
7+
#include "host_vmx.h"
8+
#include "vcpu_regs.h"
9+
10+
struct host_cpu_state {
11+
unsigned long cr0, cr3, cr4;
12+
unsigned long rip, rsp;
13+
unsigned long rflags;
14+
unsigned long fsbase, gsbase;
15+
unsigned long long debugctl, perf_global_ctrl;
16+
unsigned long long sysenter_cs, sysenter_esp, sysenter_eip;
17+
unsigned long long efer, cr_pat;
18+
unsigned short cs, ds, es, fs, gs, ss;
19+
20+
struct desc_ptr gdt, idt;
21+
};
22+
23+
static inline void read_host_cpu_state(struct host_cpu_state *hcs)
24+
{
25+
hcs->rsp = vmcs_readl(GUEST_RSP);
26+
hcs->rip = vmcs_readl(GUEST_RIP) + vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
27+
hcs->rflags = vmcs_readl(GUEST_RFLAGS);
28+
29+
hcs->ds = vmcs_read16(GUEST_DS_SELECTOR);
30+
hcs->es = vmcs_read16(GUEST_ES_SELECTOR);
31+
hcs->fs = vmcs_read16(GUEST_FS_SELECTOR);
32+
hcs->gs = vmcs_read16(GUEST_GS_SELECTOR);
33+
hcs->ss = vmcs_read16(GUEST_SS_SELECTOR);
34+
hcs->cs = vmcs_read16(GUEST_CS_SELECTOR);
35+
36+
hcs->fsbase = vmcs_readl(GUEST_FS_BASE);
37+
hcs->gsbase = vmcs_readl(GUEST_GS_BASE);
38+
39+
hcs->gdt.address = vmcs_readl(GUEST_GDTR_BASE);
40+
hcs->gdt.size = vmcs_read32(GUEST_GDTR_LIMIT);
41+
hcs->idt.address = vmcs_readl(GUEST_IDTR_BASE);
42+
hcs->idt.size = vmcs_read32(GUEST_IDTR_LIMIT);
43+
44+
hcs->debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
45+
hcs->perf_global_ctrl = vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL);
46+
hcs->sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
47+
hcs->sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
48+
hcs->sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
49+
hcs->efer = vmcs_read64(GUEST_IA32_EFER);
50+
hcs->cr_pat = vmcs_read64(GUEST_IA32_PAT);
51+
52+
hcs->cr0 = vmcs_readl(GUEST_CR0);
53+
hcs->cr3 = vmcs_readl(GUEST_CR3);
54+
hcs->cr4 = vmcs_readl(GUEST_CR4);
55+
}
56+
57+
#define PKVM_WRITE_CR(crnum, val) \
58+
static inline void __pkvm_write_cr##crnum(unsigned long val) \
59+
{ \
60+
asm volatile("mov %0,%%cr" #crnum : "+r" (val) : : "memory"); \
61+
}
62+
63+
PKVM_WRITE_CR(0, val)
64+
PKVM_WRITE_CR(3, val)
65+
PKVM_WRITE_CR(4, val)
66+
67+
/*
 * Turn x into a string literal. RESTORE_VCPU_REG() goes through this extra
 * macro level so that an offset given as a macro (presumably the VCPU_*
 * constants from vcpu_regs.h) is expanded before being stringified.
 */
#define STRINGIFY(x) #x

/*
 * Produce the assembly text that loads one register from the register
 * backup area pointed to by %rdi.
 *   offset:   offset of the register's saved value within that area
 *   dest_reg: register to be loaded
 */
#define RESTORE_VCPU_REG(offset, dest_reg)				\
	"mov " STRINGIFY(offset) "(%%rdi), %%" #dest_reg "\n"
75+
76+
77+
static inline void restore_host_special_regs(struct host_cpu_state *hcs)
78+
{
79+
struct desc_struct *gdt_desc;
80+
tss_desc *tss;
81+
82+
/* Reset the busy bit to reload TR */
83+
gdt_desc = (struct desc_struct *)(hcs->gdt.address);
84+
tss = (tss_desc *)&gdt_desc[GDT_ENTRY_TSS];
85+
tss->type = DESC_TSS;
86+
87+
__pkvm_write_cr4(hcs->cr4);
88+
__pkvm_write_cr0(hcs->cr0);
89+
__pkvm_write_cr3(hcs->cr3);
90+
91+
wrmsrq_safe(MSR_CORE_PERF_GLOBAL_CTRL, hcs->perf_global_ctrl);
92+
wrmsrq(MSR_IA32_DEBUGCTLMSR, hcs->debugctl);
93+
wrmsrq(MSR_IA32_SYSENTER_CS, hcs->sysenter_cs);
94+
wrmsrq(MSR_IA32_SYSENTER_ESP, hcs->sysenter_esp);
95+
wrmsrq(MSR_IA32_SYSENTER_EIP, hcs->sysenter_eip);
96+
wrmsrq(MSR_IA32_CR_PAT, hcs->cr_pat);
97+
wrmsrq(MSR_EFER, hcs->efer);
98+
99+
asm volatile (
100+
"lgdt %0\n"
101+
"lidt %1\n"
102+
"ltr %w2\n"
103+
"mov %3, %%ds\n"
104+
"mov %4, %%es\n"
105+
"mov %5, %%fs\n"
106+
"mov %6, %%gs\n"
107+
108+
:
109+
: "m"(hcs->gdt), "m"(hcs->idt), "q"(GDT_ENTRY_TSS*8),
110+
"m"(hcs->ds), "m"(hcs->es), "m"(hcs->fs), "m"(hcs->gs)
111+
: "memory"
112+
);
113+
114+
wrmsrl(MSR_FS_BASE, hcs->fsbase);
115+
wrmsrl(MSR_GS_BASE, hcs->gsbase);
116+
}
117+
118+
/* Restores host cpu state and returns to host in VMX root mode. */
119+
void pkvm_vmx_reprivilege_cpu(unsigned long *vcpu_regs)
120+
{
121+
static struct host_cpu_state hcs;
122+
123+
read_host_cpu_state(&hcs);
124+
restore_host_special_regs(&hcs);
125+
126+
asm volatile(
127+
/* Update stack as expected by iretq */
128+
"pushq %0\n"
129+
"pushq %1\n"
130+
"pushq %2\n"
131+
"pushq %3\n"
132+
"pushq %4\n"
133+
134+
/* Restore general purpose registers */
135+
RESTORE_VCPU_REG(VCPU_RCX, rcx)
136+
RESTORE_VCPU_REG(VCPU_RDX, rdx)
137+
RESTORE_VCPU_REG(VCPU_RBX, rbx)
138+
RESTORE_VCPU_REG(VCPU_RBP, rbp)
139+
RESTORE_VCPU_REG(VCPU_RSI, rsi)
140+
RESTORE_VCPU_REG(VCPU_R8, r8)
141+
RESTORE_VCPU_REG(VCPU_R9, r9)
142+
RESTORE_VCPU_REG(VCPU_R10, r10)
143+
RESTORE_VCPU_REG(VCPU_R11, r11)
144+
RESTORE_VCPU_REG(VCPU_R12, r12)
145+
RESTORE_VCPU_REG(VCPU_R13, r13)
146+
RESTORE_VCPU_REG(VCPU_R14, r14)
147+
RESTORE_VCPU_REG(VCPU_R15, r15)
148+
149+
/* Restore RDI (last!) */
150+
RESTORE_VCPU_REG(VCPU_RDI, rdi)
151+
152+
/*
153+
* We are not technically returning from the hypercall, but set
154+
* RAX to zero to indicate to host that reprivilege succeeded.
155+
*/
156+
"xor %%rax, %%rax\n"
157+
158+
"iretq\n"
159+
160+
:
161+
: "m"(hcs.ss), "m"(hcs.rsp), "m"(hcs.rflags),
162+
"m"(hcs.cs), "m"(hcs.rip), "D"(vcpu_regs)
163+
: "memory", "cc"
164+
);
165+
}
166+
STACK_FRAME_NON_STANDARD(pkvm_vmx_reprivilege_cpu);

arch/x86/kvm/pkvm/vmx/host_vmx.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ static struct pkvm_init_ops vmx_init_ops = {
3434
.host_mmu_init = pkvm_host_ept_init,
3535
.host_mmu_finalize = pkvm_host_ept_finalize,
3636
.hyp_global_init = pkvm_vmx_init,
37+
.reprivilege_cpu = pkvm_vmx_reprivilege_cpu,
3738
};
3839

3940
static void skip_emulated_instruction(void)

arch/x86/kvm/pkvm/vmx/host_vmx.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,6 @@ static inline void request_host_immediate_exit(struct vcpu_vmx *vmx)
1212

1313
void pkvm_host_vmexit_main(struct vcpu_vmx *vmx);
1414

15+
void pkvm_vmx_reprivilege_cpu(unsigned long *vcpu_regs);
16+
1517
#endif /* __PKVM_VMX_HOST_VMX_H */

arch/x86/kvm/vmx/pkvm_init.c

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1123,7 +1123,49 @@ static __init int pkvm_hyp_init(void)
11231123

11241124
static __init void pkvm_host_reprivilege_cpu(void *data)
11251125
{
1126-
/* TODO: Implement reprivilege logic. */
1126+
unsigned long flags;
1127+
int cpu = get_cpu();
1128+
int ret;
1129+
1130+
if (!this_cpu_read(deprivileged)) {
1131+
put_cpu();
1132+
return;
1133+
}
1134+
1135+
local_irq_save(flags);
1136+
1137+
/*
1138+
* Load the RW GDT page for reprivilege code
1139+
* to reload TR.
1140+
*/
1141+
load_direct_gdt(cpu);
1142+
1143+
/*
1144+
* Intel CET requires indirect jmp/call to return to
1145+
* endbr64 instruction. So we can't use kvm_hypercall
1146+
* here.
1147+
*/
1148+
asm volatile(
1149+
"vmcall\n"
1150+
"endbr64\n"
1151+
: "=a"(ret)
1152+
: "a"(__pkvm__reprivilege_cpu)
1153+
: "memory");
1154+
1155+
/* Switch back to RO GDT page */
1156+
load_fixmap_gdt(cpu);
1157+
1158+
if (!ret) {
1159+
this_cpu_write(deprivileged, false);
1160+
kvm_cpu_vmxoff();
1161+
pr_info("%s: CPU%d back in host mode\n", __func__, cpu);
1162+
} else {
1163+
pr_warn("%s: CPU%d failed to reprivilege(err=%d)\n", __func__, cpu, ret);
1164+
}
1165+
1166+
local_irq_restore(flags);
1167+
1168+
put_cpu();
11271169
}
11281170

11291171
static __init void pkvm_host_reprivilege_cpus(void)

0 commit comments

Comments
 (0)