Skip to content

Commit 619f995

Browse files
author
Vineeth Pillai (Google)
committed
REORDER: pKVM: x86: VMX: CPU reprivilege functionality
The reprivilege feature allows the host to boot as bare metal in case of pKVM initialization failure. All deprivileged CPUs (in VMX non-root mode) will be transitioned back to VMX root mode and then to non-VMX mode so that the host can boot without pKVM. Signed-off-by: Vineeth Pillai (Google) <[email protected]>
1 parent 2a83fbe commit 619f995

File tree

5 files changed

+210
-3
lines changed

5 files changed

+210
-3
lines changed

arch/x86/kvm/pkvm/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ pkvm-hyp-y += $(kernel-lib)/sort.o $(kernel-lib)/bsearch.o \
3939
kvm := ..
4040
pkvm-hyp-y += $(kvm)/x86.o $(kvm)/cpuid.o
4141

42-
pkvm-hyp-$(CONFIG_PKVM_INTEL) += vmx/host_vmentry.o vmx/host_vmx.o \
42+
pkvm-hyp-$(CONFIG_PKVM_INTEL) += vmx/host_vmentry.o vmx/host_vmx.o vmx/host_repriv.o \
4343
$(kvm)/vmx/vmx.o vmx/idt.o vmx/ept.o \
4444
$(kvm)/vmx/main.o
4545

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
// SPDX-License-Identifier: GPL-2.0
2+
/*
3+
* Copyright (C) 2025 Google
4+
*/
5+
6+
#include <vmx/vmx.h>
7+
#include "host_vmx.h"
8+
#include "vcpu_regs.h"
9+
10+
/*
 * Snapshot of the host CPU state as captured from the active VMCS
 * guest-state fields.  Because the host runs deprivileged in VMX
 * non-root mode, the "guest" state in the VMCS is in fact the host's
 * state; this struct holds it while we tear down VMX and return the
 * CPU to VMX root mode.
 */
struct host_cpu_state {
	unsigned long cr0, cr3, cr4;			/* control registers */
	unsigned long rip, rsp;				/* resume point and stack */
	unsigned long rflags;
	unsigned long fsbase, gsbase;			/* FS/GS base MSR values */
	unsigned long long debugctl, perf_global_ctrl;	/* debug/perf MSR snapshots */
	unsigned short cs, ds, es, fs, gs, ss;		/* segment selectors */

	struct desc_ptr gdt, idt;			/* descriptor table base/limit */
};
20+
21+
/*
 * Capture the host's CPU state from the currently loaded VMCS.
 *
 * The host executes in VMX non-root mode, so the VMCS guest-state
 * fields describe the host.  Must run on the CPU whose VMCS is
 * current, in the context of the reprivilege hypercall vmexit.
 *
 * @hcs: out-parameter filled with the captured state.
 */
static inline void read_host_cpu_state(struct host_cpu_state *hcs)
{
	hcs->rsp = vmcs_readl(GUEST_RSP);
	/*
	 * Resume past the vmcall instruction that requested the
	 * reprivilege, not at it — otherwise we would re-enter the
	 * hypercall on iretq.
	 */
	hcs->rip = vmcs_readl(GUEST_RIP) + vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
	hcs->rflags = vmcs_readl(GUEST_RFLAGS);

	hcs->ds = vmcs_read16(GUEST_DS_SELECTOR);
	hcs->es = vmcs_read16(GUEST_ES_SELECTOR);
	hcs->fs = vmcs_read16(GUEST_FS_SELECTOR);
	hcs->gs = vmcs_read16(GUEST_GS_SELECTOR);
	hcs->ss = vmcs_read16(GUEST_SS_SELECTOR);
	hcs->cs = vmcs_read16(GUEST_CS_SELECTOR);

	/*
	 * FS/GS base are kept separately: reloading the selectors later
	 * clobbers the bases in 64-bit mode, so they are restored via
	 * MSR writes afterwards.
	 */
	hcs->fsbase = vmcs_readl(GUEST_FS_BASE);
	hcs->gsbase = vmcs_readl(GUEST_GS_BASE);

	hcs->gdt.address = vmcs_readl(GUEST_GDTR_BASE);
	hcs->gdt.size = vmcs_read32(GUEST_GDTR_LIMIT);
	hcs->idt.address = vmcs_readl(GUEST_IDTR_BASE);
	hcs->idt.size = vmcs_read32(GUEST_IDTR_LIMIT);

	hcs->debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
	hcs->perf_global_ctrl = vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL);

	hcs->cr0 = vmcs_readl(GUEST_CR0);
	hcs->cr3 = vmcs_readl(GUEST_CR3);
	hcs->cr4 = vmcs_readl(GUEST_CR4);
}
49+
50+
#define PKVM_WRITE_CR(crnum, val) \
51+
static inline void __pkvm_write_cr##crnum(unsigned long val) \
52+
{ \
53+
asm volatile("mov %0,%%cr" #crnum : "+r" (val) : : "memory"); \
54+
}
55+
56+
PKVM_WRITE_CR(0, val)
57+
PKVM_WRITE_CR(3, val)
58+
PKVM_WRITE_CR(4, val)
59+
60+
/*
 * Two-level stringification so that macro arguments are fully expanded
 * before being turned into text.  A single-level "#x" stringifies the
 * argument literally: STRINGIFY(VCPU_RCX) would paste the name
 * "VCPU_RCX" into the asm template instead of its numeric value, and
 * the assembler cannot resolve C-level macros.  (Same idiom as the
 * kernel's __stringify() in <linux/stringify.h>.)
 */
#define __STRINGIFY(x) #x
#define STRINGIFY(x) __STRINGIFY(x)

/*
 * Restores register state from memory pointed by rdi.
 * offset: byte offset of the saved register in the vcpu_regs area
 *         (a VCPU_* constant from vcpu_regs.h)
 * dest_reg: register to be restored.
 */
#define RESTORE_VCPU_REG(offset, dest_reg) \
	"mov " STRINGIFY(offset) "(%%rdi), %%" #dest_reg "\n"
69+
70+
/*
 * Restore the host's control registers, descriptor tables, segment
 * registers and relevant MSRs from the captured snapshot, in
 * preparation for returning to the host in VMX root mode.
 *
 * Ordering matters: CR4/CR0/CR3 first (paging/mode bits must be in
 * place), then GDT/IDT, then TR and data segment selectors, and the
 * FS/GS bases last since the selector loads clobber them in 64-bit
 * mode.
 *
 * Caller must have loaded a writable GDT (see load_direct_gdt() on the
 * host side): clearing the TSS busy bit below writes into the GDT.
 */
static inline void restore_host_special_regs(struct host_cpu_state *hcs)
{
	struct desc_struct *gdt_desc;
	tss_desc *tss;

	/*
	 * Reset the busy bit to reload TR: ltr faults on a descriptor
	 * already marked busy, so rewrite the type as available TSS.
	 */
	gdt_desc = (struct desc_struct *)(hcs->gdt.address);
	tss = (tss_desc *)&gdt_desc[GDT_ENTRY_TSS];
	tss->type = DESC_TSS;

	__pkvm_write_cr4(hcs->cr4);
	__pkvm_write_cr0(hcs->cr0);
	__pkvm_write_cr3(hcs->cr3);

	/* Reload descriptor tables, TR and the data segment selectors. */
	asm volatile (
		"lgdt %0\n"
		"lidt %1\n"
		"ltr %w2\n"
		"mov %3, %%ds\n"
		"mov %4, %%es\n"
		"mov %5, %%fs\n"
		"mov %6, %%gs\n"

		:
		: "m"(hcs->gdt), "m"(hcs->idt), "q"(GDT_ENTRY_TSS*8),
		  "m"(hcs->ds), "m"(hcs->es), "m"(hcs->fs), "m"(hcs->gs)
		: "memory"
	);

	/* Selector loads above zapped the 64-bit bases; restore via MSRs. */
	wrmsrl(MSR_FS_BASE, hcs->fsbase);
	wrmsrl(MSR_GS_BASE, hcs->gsbase);

	/*
	 * NOTE(review): wrmsrl() and wrmsrq() are mixed here — presumably
	 * both resolve to the same MSR write in this tree; confirm and
	 * pick one spelling for consistency.
	 */
	wrmsrq(MSR_IA32_DEBUGCTLMSR, hcs->debugctl);
	wrmsrq(MSR_CORE_PERF_GLOBAL_CTRL, hcs->perf_global_ctrl);
}
105+
106+
/*
 * Restores host cpu state and returns to host in VMX root mode.
 *
 * Called from the hypervisor side of the reprivilege hypercall, with
 * this CPU's VMCS current.  Captures the host ("guest") state from the
 * VMCS, restores special registers, then builds an iretq frame
 * (SS, RSP, RFLAGS, CS, RIP) and iretq's directly back to the host
 * instruction following the vmcall.  This function never returns to
 * its caller on success.
 *
 * @vcpu_regs: saved host general-purpose registers, laid out at the
 *             VCPU_* offsets from vcpu_regs.h; handed to the asm in
 *             %rdi via the "D" constraint.
 */
void pkvm_vmx_reprivilege_cpu(unsigned long *vcpu_regs)
{
	/*
	 * NOTE(review): hcs is static, so two CPUs executing this path
	 * concurrently would race on it.  Looks safe today because the
	 * host reprivileges CPUs one at a time via
	 * smp_call_function_single() — confirm that remains serialized.
	 */
	static struct host_cpu_state hcs;

	read_host_cpu_state(&hcs);
	restore_host_special_regs(&hcs);

	asm volatile(
		/*
		 * Update stack as expected by iretq:
		 * SS, RSP, RFLAGS, CS, RIP (pushed in that order).
		 * NOTE(review): %0 (ss) and %3 (cs) are 16-bit fields
		 * pushed as 64-bit memory operands; iretq ignores the
		 * upper bits of the selector slots, but confirm the
		 * assembler accepts/widens these pushes as intended.
		 */
		"pushq %0\n"
		"pushq %1\n"
		"pushq %2\n"
		"pushq %3\n"
		"pushq %4\n"

		/* Restore general purpose registers */
		RESTORE_VCPU_REG(VCPU_RCX, rcx)
		RESTORE_VCPU_REG(VCPU_RDX, rdx)
		RESTORE_VCPU_REG(VCPU_RBX, rbx)
		RESTORE_VCPU_REG(VCPU_RBP, rbp)
		RESTORE_VCPU_REG(VCPU_RSI, rsi)
		RESTORE_VCPU_REG(VCPU_R8, r8)
		RESTORE_VCPU_REG(VCPU_R9, r9)
		RESTORE_VCPU_REG(VCPU_R10, r10)
		RESTORE_VCPU_REG(VCPU_R11, r11)
		RESTORE_VCPU_REG(VCPU_R12, r12)
		RESTORE_VCPU_REG(VCPU_R13, r13)
		RESTORE_VCPU_REG(VCPU_R14, r14)

		RESTORE_VCPU_REG(VCPU_R15, r15)

		/* Restore RDI (last!) — it is the base pointer for the
		 * loads above, so it must not be clobbered earlier. */
		RESTORE_VCPU_REG(VCPU_RDI, rdi)

		/*
		 * We are not technically returning from the hypercall, but set
		 * RAX to zero to indicate to host that reprivilege succeeded.
		 */
		"xor %%rax, %%rax\n"

		"iretq\n"

		:
		: "m"(hcs.ss), "m"(hcs.rsp), "m"(hcs.rflags),
		  "m"(hcs.cs), "m"(hcs.rip), "D"(vcpu_regs)
		: "memory", "cc"
	);
}
/* The asm block never returns (iretq), so the frame is non-standard. */
STACK_FRAME_NON_STANDARD(pkvm_vmx_reprivilege_cpu);

arch/x86/kvm/pkvm/vmx/host_vmx.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ static struct pkvm_init_ops vmx_init_ops = {
3434
.host_mmu_init = pkvm_host_ept_init,
3535
.host_mmu_finalize = pkvm_host_ept_finalize,
3636
.hyp_global_init = pkvm_vmx_init,
37+
.reprivilege_cpu = pkvm_vmx_reprivilege_cpu,
3738
};
3839

3940
static void skip_emulated_instruction(void)

arch/x86/kvm/pkvm/vmx/host_vmx.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,6 @@ static inline void request_host_immediate_exit(struct vcpu_vmx *vmx)
1212

1313
void pkvm_host_vmexit_main(struct vcpu_vmx *vmx);
1414

15+
void pkvm_vmx_reprivilege_cpu(unsigned long *vcpu_regs);
16+
1517
#endif /* __PKVM_VMX_HOST_VMX_H */

arch/x86/kvm/vmx/pkvm_init.c

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -971,6 +971,7 @@ static noinline int local_deprivilege_cpu(void)
971971
return ret;
972972
}
973973

974+
static DEFINE_PER_CPU(bool, deprivileged);
974975
static __init void pkvm_host_deprivilege_cpu(void *data)
975976
{
976977
struct pkvm_deprivilege_param *p = data;
@@ -1001,6 +1002,7 @@ static __init void pkvm_host_deprivilege_cpu(void *data)
10011002
}
10021003

10031004
vcpu->mode = IN_GUEST_MODE;
1005+
this_cpu_write(deprivileged, true);
10041006
pr_info("CPU%d in guest mode\n", cpu);
10051007
return;
10061008
vmxoff:
@@ -1106,7 +1108,53 @@ static __init int pkvm_hyp_init(void)
11061108

11071109
/*
 * Host-side per-CPU reprivilege handler, run on each deprivileged CPU
 * via smp_call_function_single().  Issues the reprivilege hypercall;
 * on success the hypervisor iretq's back here with this CPU in VMX
 * root mode, and we then leave VMX entirely with kvm_cpu_vmxoff().
 *
 * @data: the pkvm host vcpu for this CPU (struct kvm_vcpu *).
 */
static __init void pkvm_host_reprivilege_cpu(void *data)
{
	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
	unsigned long flags;
	int cpu = get_cpu();
	int ret;

	/* Nothing to do if this CPU never finished deprivileging. */
	if (!this_cpu_read(deprivileged)) {
		put_cpu();
		return;
	}

	local_irq_save(flags);

	/*
	 * Load the RW GDT page for reprivilege code
	 * to reload TR: the hypervisor clears the TSS busy bit by
	 * writing into the GDT, which faults on the read-only fixmap
	 * alias.
	 */
	load_direct_gdt(cpu);

	/*
	 * Intel CET requires indirect jmp/call to return to
	 * endbr64 instruction. So we can't use kvm_hypercall
	 * here.
	 * NOTE(review): the hypervisor resumes at the instruction after
	 * vmcall (GUEST_RIP + instruction length), i.e. at this
	 * endbr64 — confirm that is the intended landing pad and that
	 * RAX carries the hypercall nr in / status out as assumed.
	 */
	asm volatile(
		"vmcall\n"
		"endbr64\n"
		: "=a"(ret)
		: "a"(__pkvm__reprivilege_cpu)
		: "memory");

	/*
	 * Switch back to RO GDT page
	 */
	load_fixmap_gdt(cpu);

	if (!ret) {
		/* Back in VMX root mode: mark host-owned and leave VMX. */
		this_cpu_write(deprivileged, false);
		vcpu->mode = OUTSIDE_GUEST_MODE;
		kvm_cpu_vmxoff();
		pr_info("%s: CPU%d back in host mode\n", __func__, cpu);
	} else {
		pr_warn("%s: CPU%d failed to reprivilege(err=%d)\n", __func__, cpu, ret);
	}

	local_irq_restore(flags);

	put_cpu();
}
11111159

11121160
static __init void pkvm_host_reprivilege_cpus(struct pkvm_hyp *pkvm)
@@ -1115,7 +1163,7 @@ static __init void pkvm_host_reprivilege_cpus(struct pkvm_hyp *pkvm)
11151163

11161164
for_each_possible_cpu(cpu) {
11171165
struct kvm_vcpu *vcpu = pkvm->host_vcpus[cpu];
1118-
if (vcpu->mode == OUTSIDE_GUEST_MODE)
1166+
if (!per_cpu(deprivileged, cpu))
11191167
continue;
11201168

11211169
smp_call_function_single(cpu, pkvm_host_reprivilege_cpu,

0 commit comments

Comments
 (0)